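"""Gradio demo for so-vits-svc voice conversion: GTA Vice City character voices.

Scans models/ for per-speaker checkpoints and builds, for each one, a
voice-to-voice tab and an edge-tts-backed text-to-speech tab.

Typical launch (script name assumed; the flags are defined below):
    python app.py --device cpu --share
"""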
import argparse
import logging
import os
import re
import gradio.processing_utils as gr_pu
import gradio as gr
import librosa
import numpy as np
import soundfile
from scipy.io import wavfile
import tempfile
import edge_tts
import utils
from inference.infer_tool import Svc
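
# Quiet noisy third-party loggers so inference output stays readable.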
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
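
# All audio fed to and produced by the models uses this sample rate.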
sampling_rate = 44100
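
# UI speaker labels mapped to edge-tts voice names.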
tts_voice = {
    "Chinese Male": "zh-CN-YunxiNeural",
    "Chinese Female": "zh-CN-XiaoyiNeural",
    "English Male": "en-US-EricNeural",
    "English Female": "en-US-AnaNeural"
}
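
# Load both supported speech encoders once on CPU so every model can share them.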
hubert_dict = {
    "vec768l12": utils.get_speech_encoder("vec768l12", device="cpu"),
    "vec256l9": utils.get_speech_encoder("vec256l9", device="cpu")
}


def create_fn(model, spk):
    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        if input_audio is None:
            return 0, None
        sr, audio = input_audio
        # Gradio delivers integer PCM here; guard in case a float array arrives.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        # Down-mix to mono before resampling, so librosa resamples along the
        # time axis rather than the channel axis.
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
        # Swap in the speech encoder this checkpoint was trained with.
        model.hubert_model = hubert_dict[model.speech_encoder]
        out_audio = model.slice_inference(raw_audio_path=temp_path,
                                          spk=spk,
                                          slice_db=-40,
                                          cluster_infer_ratio=0,
                                          noice_scale=0.4,
                                          clip_seconds=10,
                                          tran=vc_transform,
                                          f0_predictor=f0p,
                                          auto_predict_f0=auto_f0)
        model.clear_empty()
        os.remove(temp_path)
        return sampling_rate, out_audio
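
    # TTS path: synthesize speech with edge-tts, then feed it through svc_fn.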
    async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        if input_text == '':
            return 0, None
        # Strip newlines, commas, parentheses and spaces, which disturb synthesis.
        input_text = re.sub(r"[\n,() ]", "", input_text)
        voice = tts_voice[gender]
        # edge-tts expects a signed percentage string such as "+10%" or "-5%".
        ratestr = "{:+.0%}".format(tts_rate)
        communicate = edge_tts.Communicate(text=input_text, voice=voice, rate=ratestr)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            temp_path = tmp_file.name
        await communicate.save(temp_path)
        audio, sr = librosa.load(temp_path)
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        os.remove(temp_path)
        # Round-trip through a 16-bit WAV so the input matches what the Gradio
        # upload path would hand to svc_fn.
        temp_path = "temp.wav"
        wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
        sr, audio = gr_pu.audio_from_file(temp_path)
        input_audio = (sampling_rate, audio)
        return svc_fn(input_audio, vc_transform, auto_f0, f0p)

    return svc_fn, tts_fn


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()

    # Each subdirectory of models/ holds one speaker: <name>.pth,
    # config_<name>.json and an optional cover image.
    models = []
    for f in os.listdir("models"):
        name = f
        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
        cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else f"models/{f}/cover.jpg"
        models.append((name, cover, create_fn(model, name)))
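
    # One tab per model; each tab offers voice-to-voice and text-to-speech modes.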
    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> GTA Vice City Character Voice Generation\n"
            "## <center> Model author: Bilibili [Cyber蝈蝈总](https://space.bilibili.com/37706580)\n"
            "#### <center> Portals: [GTASA](https://huggingface.shushu.icu/spaces/GroveStreet/GTA_SOVITS); [GTAV](https://huggingface.shushu.icu/spaces/GroveStreet/GTAV_SOVITS)\n"
            "<center>Please credit the source for works created with this resource"
        )
        with gr.Tabs():
            for (name, cover, (svc_fn, tts_fn)) in models:
                with gr.TabItem(name):
                    with gr.Row():
                        with gr.Column():
                            with gr.Row():
                                vc_transform = gr.Number(label="Pitch shift (semitones, ±; 12 = one octave)", value=0)
                                f0_predictor = gr.Radio(label="F0 predictor (rmvpe recommended)",
                                                        choices=['crepe', 'harvest', 'rmvpe'], value='rmvpe')
                                auto_f0 = gr.Checkbox(label="Automatic pitch prediction (optional for TTS or speech; makes singing go off-key)",
                                                      value=False)
                            with gr.Tabs():
                                with gr.TabItem('Voice-to-voice'):
                                    svc_input = gr.Audio(
                                        label="Upload a dry vocal (audio of any length is supported; processing takes about 5x its duration)")
                                    svc_submit = gr.Button("Generate", variant="primary")
                                with gr.TabItem('Text-to-speech'):
                                    tts_input = gr.Textbox(label='Text to speak', value='',
                                                           placeholder='Text of any length is supported; processing takes about 5x the spoken duration')
                                    with gr.Row():
                                        gender = gr.Radio(label='Speaker gender (male voices are lower-pitched, female higher)', value='Chinese Male',
                                                          choices=['Chinese Male', 'Chinese Female', 'English Male', 'English Female'])
                                        tts_rate = gr.Number(label='Speech rate (±, as a fraction: 0.1 = +10%)', value=0)
                                    tts_submit = gr.Button("Generate", variant="primary")
                        with gr.Column():
                            gr.Image(cover, width=400, height=400)
                            vc_output = gr.Audio(label="Output audio")
                    svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
                    tts_submit.click(tts_fn, [tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
                                     vc_output)

    app.queue(api_open=args.api).launch(share=args.share)