import argparse
import logging
import os
import re
import tempfile

import edge_tts
import gradio as gr
import librosa
import numpy as np
import soundfile

import utils
from inference.infer_tool import Svc
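# Quiet noisy third-party loggers.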
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
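# Target sample rate for everything fed to or returned from the models.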
sampling_rate = 44100
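# Maps UI speaker labels to edge-tts voice names.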
tts_voice = {
    "Chinese Male": "zh-CN-YunxiNeural",
    "Chinese Female": "zh-CN-XiaoyiNeural",
    "English Male": "en-US-EricNeural",
    "English Female": "en-US-AnaNeural"
}
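# Load each speech encoder once on CPU so every model can share them.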
hubert_dict = {
"vec768l12": utils.get_speech_encoder("vec768l12", device="cpu"),
"vec256l9": utils.get_speech_encoder("vec256l9", device="cpu")
}
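# Build per-model Gradio callbacks; the closures capture the loaded model and speaker name.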
def create_fn(model, spk):
def svc_fn(input_audio, vc_transform, auto_f0, f0p):
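        # Voice conversion: normalize the upload, run sliced inference, return audio at 44.1 kHz.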
if input_audio is None:
return 0, None
sr, audio = input_audio
        # Gradio normally delivers integer PCM; guard in case a float array arrives.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            audio = audio.astype(np.float32)
        # Down-mix to mono before resampling: librosa.resample treats the last
        # axis as time, so a stereo (samples, channels) array must not reach it directly.
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
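        # slice_inference reads from a file path, so stage the audio in a scratch wav file.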
temp_path = "temp.wav"
soundfile.write(temp_path, audio, sampling_rate, format="wav")
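        # Attach the shared speech encoder that matches this model's config.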
model.hubert_model = hubert_dict[model.speech_encoder]
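        # Fixed settings: slice below -40 dB, 10-second clips, no cluster guidance;
        # 'noice_scale' is so-vits-svc's own spelling of the parameter.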
out_audio = model.slice_inference(raw_audio_path=temp_path,
spk=spk,
slice_db=-40,
cluster_infer_ratio=0,
noice_scale=0.4,
clip_seconds=10,
tran=vc_transform,
f0_predictor=f0p,
auto_predict_f0=auto_f0)
model.clear_empty()
os.remove(temp_path)
return sampling_rate, out_audio
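    # Text-to-speech: synthesize with edge-tts, then run the result through svc_fn.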
async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
if input_text == '':
return 0, None
        # Strip newlines and parentheses; spaces are kept so English input stays intelligible.
        input_text = re.sub(r"[\n()]", "", input_text)
voice = tts_voice[gender]
        ratestr = "{:+.0%}".format(tts_rate)
communicate = edge_tts.Communicate(text=input_text, voice=voice, rate=ratestr)
        # edge-tts emits MP3 data, so give the scratch file a matching suffix.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            temp_path = tmp_file.name
        await communicate.save(temp_path)
        # Load the synthesized speech directly at the target rate.
        audio, _ = librosa.load(temp_path, sr=sampling_rate)
        os.remove(temp_path)
        # Convert to 16-bit PCM and reuse the voice-conversion path.
        audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
        return svc_fn((sampling_rate, audio), vc_transform, auto_f0, f0p)
return svc_fn, tts_fn
if __name__ == '__main__':
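    # Typical launch (assuming this file is saved as app.py): python app.py --device cuda:0 --share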
parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument('--api', action="store_true", default=False)
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
args = parser.parse_args()
models = []
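    # Each folder under models/ is expected to hold <name>.pth, config_<name>.json
    # and a cover image (cover.png or cover.jpg).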
    for name in os.listdir("models"):
        model = Svc(f"models/{name}/{name}.pth", f"models/{name}/config_{name}.json", device=args.device)
        cover = f"models/{name}/cover.png" if os.path.exists(f"models/{name}/cover.png") else f"models/{name}/cover.jpg"
        models.append((name, cover, create_fn(model, name)))
with gr.Blocks() as app:
        gr.Markdown(
            """
            # <center> GTA San Andreas Character Voice Generation
            ## <center> Model author: [Cyber蝈蝈总](https://space.bilibili.com/37706580) on Bilibili
            #### <center> Portals: [GTAVC](https://huggingface.shushu.icu/spaces/GroveStreet/GTAVC_SOVITS); [GTAV](https://huggingface.shushu.icu/spaces/GroveStreet/GTAV_SOVITS)
            <center> Please credit this resource in works that use it. CJ has two models: carl1 is clearer, carl2 covers a wider vocal range.
            """
        )
with gr.Tabs():
for (name, cover, (svc_fn, tts_fn)) in models:
with gr.TabItem(name):
with gr.Row():
with gr.Column():
with gr.Row():
                                vc_transform = gr.Number(label="Pitch shift (semitones up or down; 12 = one octave)", value=0)
                                f0_predictor = gr.Radio(label="f0 predictor (rmvpe recommended)",
                                                        choices=['crepe', 'harvest', 'rmvpe'], value='rmvpe')
                            auto_f0 = gr.Checkbox(label="Automatic pitch prediction (optional for TTS or speech; throws singing off-key)",
                                                  value=False)
with gr.Tabs():
                                with gr.TabItem('Speech-to-Speech'):
                                    svc_input = gr.Audio(
                                        label="Upload dry vocals (unlimited length supported; processing takes about 5x the audio's duration)")
                                    svc_submit = gr.Button("Generate", variant="primary")
                                with gr.TabItem('Text-to-Speech'):
                                    tts_input = gr.Textbox(label='Text to speak', value='',
                                                           placeholder="Unlimited length supported; processing takes about 5x the time needed to speak the text")
with gr.Row():
                                        gender = gr.Radio(label='Speaker (male voices are lower, female voices higher)', value='Chinese Male',
                                                          choices=['Chinese Male', 'Chinese Female', 'English Male', 'English Female'])
                                        tts_rate = gr.Number(label='Speech rate (positive or negative fraction; 0.1 = +10%)', value=0)
                                    tts_submit = gr.Button("Generate", variant="primary")
with gr.Column():
gr.Image(cover, width=400, height=400)
                            vc_output = gr.Audio(label="Output audio")
svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
tts_submit.click(tts_fn, [tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
vc_output)
app.queue(api_open=args.api).launch(share=args.share)