Spaces:
Running
Running
Katock
committed on
Commit
·
a73a5ce
1
Parent(s):
054c3da
tts
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import argparse
|
|
2 |
import io
|
3 |
import logging
|
4 |
import os
|
5 |
-
|
6 |
import gradio as gr
|
7 |
import gradio.processing_utils as gr_processing_utils
|
8 |
import librosa
|
@@ -19,36 +19,16 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
|
19 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
20 |
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
#
|
25 |
-
# def audio_postprocess(self, y):
|
26 |
-
# data = audio_postprocess_ori(self, y)
|
27 |
-
# if data is None:
|
28 |
-
# return None
|
29 |
-
# return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
|
30 |
-
|
31 |
-
|
32 |
-
# gr.Audio.postprocess = audio_postprocess
|
33 |
-
|
34 |
-
|
35 |
-
def create_vc_fn(model, spk):
|
36 |
-
def vc_fn(input_audio, vc_transform, auto_f0, f0p):
|
37 |
if input_audio is None:
|
38 |
return "请先上传音频", None
|
39 |
sampling_rate, audio = input_audio
|
40 |
-
|
41 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
42 |
if len(audio.shape) > 1:
|
43 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
44 |
-
|
45 |
-
# raw_audio_path = io.BytesIO()
|
46 |
-
# soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
|
47 |
-
# raw_audio_path.seek(0)
|
48 |
-
|
49 |
temp_path = "temp.wav"
|
50 |
soundfile.write(temp_path, audio, sampling_rate, format="wav")
|
51 |
-
|
52 |
out_audio = model.slice_inference(raw_audio_path=temp_path,
|
53 |
spk=spk,
|
54 |
slice_db=-40,
|
@@ -58,12 +38,38 @@ def create_vc_fn(model, spk):
|
|
58 |
tran=vc_transform,
|
59 |
f0_predictor=f0p,
|
60 |
auto_predict_f0=auto_f0)
|
61 |
-
|
62 |
os.remove(temp_path)
|
|
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
return 44100, out_audio
|
65 |
|
66 |
-
return
|
67 |
|
68 |
|
69 |
if __name__ == '__main__':
|
@@ -78,24 +84,36 @@ if __name__ == '__main__':
|
|
78 |
name = f
|
79 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
|
80 |
cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
|
81 |
-
models.append((name, cover,
|
82 |
with gr.Blocks() as app:
|
83 |
gr.Markdown(
|
84 |
"# <center> GTASA人物SOVITS4.1(升级中,可能有bug)\n"
|
85 |
"## <center> 模型作者:B站Cyber蝈蝈总\n"
|
86 |
-
"<center> 使用此处资源创作的作品,请显著标明模型出处(B站Cyber
|
87 |
)
|
88 |
with gr.Tabs():
|
89 |
-
for (name, cover,
|
90 |
with gr.TabItem(name):
|
91 |
with gr.Row():
|
92 |
with gr.Column():
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
with gr.Column():
|
101 |
gr.Markdown(
|
@@ -104,5 +122,6 @@ if __name__ == '__main__':
|
|
104 |
'</div>'
|
105 |
)
|
106 |
vc_output = gr.Audio(label="输出音频")
|
107 |
-
vc_submit.click(
|
|
|
108 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|
|
|
2 |
import io
|
3 |
import logging
|
4 |
import os
|
5 |
+
import subprocess
|
6 |
import gradio as gr
|
7 |
import gradio.processing_utils as gr_processing_utils
|
8 |
import librosa
|
|
|
19 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
20 |
|
21 |
|
22 |
def create_fn(model, spk):
    """Build the two Gradio callbacks (voice conversion and TTS) for one model.

    Parameters
    ----------
    model : Svc
        Loaded so-vits-svc model; only its ``slice_inference`` method is used.
    spk : str
        Speaker name forwarded to ``slice_inference``.

    Returns
    -------
    tuple
        ``(svc_fn, tts_fn)`` — the audio-to-audio and text-to-audio callbacks.
    """

    def _infer(audio_path, vc_transform, auto_f0, f0p):
        # Shared inference call; always removes the scratch file, even when
        # slice_inference raises (the original leaked temp.wav on failure).
        try:
            return model.slice_inference(raw_audio_path=audio_path,
                                         spk=spk,
                                         slice_db=-40,
                                         # NOTE(review): the next three kwargs are hidden in the
                                         # diff view for svc_fn; taken from the parallel tts_fn
                                         # call — confirm against the full file.
                                         cluster_infer_ratio=0,
                                         noice_scale=0.4,
                                         clip_seconds=20,
                                         tran=vc_transform,
                                         f0_predictor=f0p,
                                         auto_predict_f0=auto_f0)
        finally:
            os.remove(audio_path)

    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        """Voice-convert an uploaded recording; returns (sample_rate, audio)."""
        if input_audio is None:
            return "请先上传音频", None
        sampling_rate, audio = input_audio
        # Normalise integer PCM to float32 in [-1, 1] and downmix to mono.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
        out_audio = _infer(temp_path, vc_transform, auto_f0, f0p)
        return 44100, out_audio

    def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        """Synthesise ``input_text`` with edge-tts, then voice-convert it."""
        voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
        output_file = "temp.wav"
        # edge-tts wants a signed percentage such as "+10%" / "-5%";
        # "{:.0%}" already emits the minus sign for negative rates,
        # so only the plus sign needs to be added explicitly.
        # (The original if/elif left ratestr unbound for NaN input.)
        if tts_rate >= 0:
            ratestr = "+{:.0%}".format(tts_rate)
        else:
            ratestr = "{:.0%}".format(tts_rate)

        # Pass arguments as a list with shell=False: the original built a
        # shell command string from user-supplied text (shell=True), which
        # allowed command injection. check=True surfaces edge-tts failures
        # instead of silently proceeding with a missing/partial file.
        subprocess.run(["edge-tts",
                        "--text", input_text,
                        "--write-media", output_file,
                        "--voice", voice,
                        "--rate=" + ratestr],
                       check=True)
        out_audio = _infer(output_file, vc_transform, auto_f0, f0p)
        return 44100, out_audio

    return svc_fn, tts_fn
|
73 |
|
74 |
|
75 |
if __name__ == '__main__':
|
|
|
84 |
name = f
|
85 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
|
86 |
cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
|
87 |
+
models.append((name, cover, create_fn(model, name)))
|
88 |
with gr.Blocks() as app:
|
89 |
gr.Markdown(
|
90 |
"# <center> GTASA人物SOVITS4.1(升级中,可能有bug)\n"
|
91 |
"## <center> 模型作者:B站Cyber蝈蝈总\n"
|
92 |
+
"<center> 使用此处资源创作的作品,请显著标明模型出处(B站Cyber蝈蝈总)\n"
|
93 |
)
|
94 |
with gr.Tabs():
|
95 |
+
for (name, cover, svc_fn, tts_fn) in models:
|
96 |
with gr.TabItem(name):
|
97 |
with gr.Row():
|
98 |
with gr.Column():
|
99 |
+
mode = gr.Radio(label='模式', value='音频转音频', choices=['文字转音频', '音频转音频'])
|
100 |
+
if mode == '音频转音频':
|
101 |
+
svc_input = gr.Audio(label="上传干声 (已支持无限长音频,处理时间约为原音频时间的5倍)")
|
102 |
+
vc_transform = gr.Number(label="音高调整 (支持正负半音,12为一个八度)", value=0)
|
103 |
+
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
104 |
+
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
105 |
+
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
106 |
+
vc_submit = gr.Button("生成", variant="primary")
|
107 |
+
else:
|
108 |
+
text_input = gr.Textbox(label='说话内容', value='',
|
109 |
+
placeholder='请输入说话内容,(已支持无限长内容,处理时间约为说完原内容时间的5倍)')
|
110 |
+
gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
|
111 |
+
tts_rate = gr.Number(label='语速(正负百分比)', value=0)
|
112 |
+
vc_transform = gr.Number(label="音高调整 (支持正负半音,12为一个八度)", value=0)
|
113 |
+
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
114 |
+
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
115 |
+
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
116 |
+
tts_submit = gr.Button("生成", variant="primary")
|
117 |
|
118 |
with gr.Column():
|
119 |
gr.Markdown(
|
|
|
122 |
'</div>'
|
123 |
)
|
124 |
vc_output = gr.Audio(label="输出音频")
|
125 |
+
vc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
|
126 |
+
tts_submit.click(tts_fn, [text_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor], vc_output)
|
127 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|