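"""Gradio web UI for so-vits-svc: voice-to-voice conversion of uploaded
recordings, plus a text-to-speech input path synthesized with edge-tts.

A sketch of a typical invocation (the filename app.py is an assumption):

    python app.py --device cuda:0 --share
"""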
import argparse
import logging
import os
import re
import gradio.processing_utils as gr_pu
import gradio as gr
import librosa
import numpy as np
import soundfile
from scipy.io import wavfile
import tempfile
import edge_tts
import utils

from inference.infer_tool import Svc

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

sampling_rate = 44100  # target rate; all audio is resampled to this before inference

# Map UI speaker choices to edge-tts voice short names.
tts_voice = {
    "Chinese male": "zh-CN-YunxiNeural",
    "Chinese female": "zh-CN-XiaoyiNeural",
    "English male": "en-US-EricNeural",
    "English female": "en-US-AnaNeural"
}

hubert_dict = {
    "vec768l12": utils.get_speech_encoder("vec768l12", device="cpu"),
    "vec256l9": utils.get_speech_encoder("vec256l9", device="cpu")
}
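# Preloading both encoders lets every model share one CPU-resident copy
# instead of loading its own; svc_fn below picks the right one for each
# model via model.speech_encoder.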


def create_fn(model, spk):
    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        """Convert an uploaded recording to the target speaker's voice."""
        if input_audio is None:
            return 0, None
        sr, audio = input_audio
        # Gradio delivers integer PCM; normalize to float32 in [-1, 1].
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        # Down-mix to mono before resampling: librosa.resample operates on the
        # last axis, so a (samples, channels) array must be collapsed first.
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")

        # Attach the speech encoder this model was trained with (preloaded above).
        model.hubert_model = hubert_dict[model.speech_encoder]
        out_audio = model.slice_inference(raw_audio_path=temp_path,
                                          spk=spk,
                                          slice_db=-40,
                                          cluster_infer_ratio=0,
                                          noice_scale=0.4,
                                          clip_seconds=10,
                                          tran=vc_transform,
                                          f0_predictor=f0p,
                                          auto_predict_f0=auto_f0)
        model.clear_empty()
        os.remove(temp_path)
        return sampling_rate, out_audio

    async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        """Synthesize speech with edge-tts, then run it through svc_fn."""
        if not input_text:
            return 0, None
        # Strip whitespace and punctuation that tends to trip up the TTS service.
        input_text = re.sub(r"[\n,() ]", "", input_text)
        voice = tts_voice[gender]
        # edge-tts expects a signed percentage such as "+10%"; tts_rate is a
        # fraction, so 0.1 becomes "+10%".
        ratestr = "{:+.0%}".format(tts_rate)
        communicate = edge_tts.Communicate(text=input_text, voice=voice, rate=ratestr)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            temp_path = tmp_file.name
        await communicate.save(temp_path)

        audio, sr = librosa.load(temp_path)
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        os.remove(temp_path)
        # Round-trip through a 16-bit WAV so svc_fn receives the same
        # (sample_rate, int16 array) tuple that Gradio's Audio input produces.
        temp_path = "temp.wav"
        wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
        sr, audio = gr_pu.audio_from_file(temp_path)
        input_audio = (sampling_rate, audio)
        return svc_fn(input_audio, vc_transform, auto_f0, f0p)

    return svc_fn, tts_fn
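
# A minimal sketch of driving the conversion function without the UI; the
# model paths and speaker name "carl1" are illustrative assumptions:
#
#   model = Svc("models/carl1/carl1.pth", "models/carl1/config_carl1.json", device="cpu")
#   svc_fn, _tts_fn = create_fn(model, "carl1")
#   wav, sr = soundfile.read("input.wav", dtype="int16")  # hypothetical input file
#   out_sr, out_wav = svc_fn((sr, wav), vc_transform=0, auto_f0=False, f0p="rmvpe")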


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()
    models = []
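    # Each subdirectory of models/ is expected to provide <name>.pth,
    # config_<name>.json, and a cover.png (falling back to cover.jpg).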
    for f in os.listdir("models"):
        name = f
        model = Svc(f"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
        cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else f"models/{f}/cover.jpg"
        models.append((name, cover, create_fn(model, name)))
    with gr.Blocks() as app:
        gr.Markdown(
            """
            # <center> GTA San Andreas Character Voice Generation
            ## <center> Model author: [Cyber蝈蝈总](https://space.bilibili.com/37706580) on Bilibili
            #### <center> Related spaces: [GTAVC](https://huggingface.shushu.icu/spaces/GroveStreet/GTAVC_SOVITS); [GTAV](https://huggingface.shushu.icu/spaces/GroveStreet/GTAV_SOVITS)
            <center> Please credit this resource in works made with it. CJ has two models: carl1 is clearer, carl2 has a wider vocal range.
            """
        )
        with gr.Tabs():
            for (name, cover, (svc_fn, tts_fn)) in models:
                with gr.TabItem(name):
                    with gr.Row():
                        with gr.Column():
                            with gr.Row():
                                vc_transform = gr.Number(label="Pitch shift (semitones, +/-; 12 = one octave)", value=0)
                                f0_predictor = gr.Radio(label="F0 predictor (rmvpe recommended)",
                                                        choices=['crepe', 'harvest', 'rmvpe'], value='rmvpe')
                            auto_f0 = gr.Checkbox(label="Automatic pitch prediction (optional for TTS or speech; makes singing go off-key)",
                                                  value=False)
                            with gr.Tabs():
                                with gr.TabItem('Voice to voice'):
                                    svc_input = gr.Audio(
                                        label="Upload dry vocals (unlimited length supported; processing takes about 5x the audio's duration)")
                                    svc_submit = gr.Button("Generate", variant="primary")

                                with gr.TabItem('Text to speech'):
                                    tts_input = gr.Textbox(label='Text to speak', value='',
                                                           placeholder='Unlimited length supported; processing takes about 5x the time needed to speak the text')
                                    with gr.Row():
                                        gender = gr.Radio(label='Speaker gender (male voices are lower-pitched, female higher)', value='Chinese male',
                                                          choices=['Chinese male', 'Chinese female', 'English male', 'English female'])
                                        tts_rate = gr.Number(label='Speech rate (+/- fraction; 0.1 = +10%)', value=0)
                                    tts_submit = gr.Button("Generate", variant="primary")

                        with gr.Column():
                            gr.Image(cover, width=400, height=400)
                            vc_output = gr.Audio(label="Output audio")
                    svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
                    tts_submit.click(tts_fn, [tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
                                     vc_output)
        app.queue(api_open=args.api).launch(share=args.share)
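    # Notes on the launch line above: queue() serializes requests (one at a
    # time by default in Gradio 3.x), api_open toggles the programmatic Gradio
    # API, and --share asks Gradio to create a temporary public link.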