Spaces:
Running
Running
Katock
committed on
Commit
·
8c35828
1
Parent(s):
dc6504c
Update app.py
Browse files
app.py
CHANGED
@@ -20,16 +20,8 @@ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingfac
|
|
20 |
|
21 |
|
22 |
def create_fn(model, spk):
|
23 |
-
def
|
24 |
-
|
25 |
-
return "请先上传音频", None
|
26 |
-
sampling_rate, audio = input_audio
|
27 |
-
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
28 |
-
if len(audio.shape) > 1:
|
29 |
-
audio = librosa.to_mono(audio.transpose(1, 0))
|
30 |
-
temp_path = "temp.wav"
|
31 |
-
soundfile.write(temp_path, audio, sampling_rate, format="wav")
|
32 |
-
out_audio = model.slice_inference(raw_audio_path=temp_path,
|
33 |
spk=spk,
|
34 |
slice_db=-40,
|
35 |
cluster_infer_ratio=0,
|
@@ -38,36 +30,35 @@ def create_fn(model, spk):
|
|
38 |
tran=vc_transform,
|
39 |
f0_predictor=f0p,
|
40 |
auto_predict_f0=auto_f0)
|
41 |
-
os.remove(
|
42 |
return 44100, out_audio
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
|
|
|
|
|
45 |
voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
|
46 |
-
|
47 |
-
|
48 |
-
ratestr = "+{:.0%}".format(tts_rate)
|
49 |
-
elif tts_rate < 0:
|
50 |
-
ratestr = "{:.0%}".format(tts_rate) # 减号自带
|
51 |
-
|
52 |
p = subprocess.Popen("edge-tts " +
|
53 |
" --text " + input_text +
|
54 |
-
" --write-media " +
|
55 |
" --voice " + voice +
|
56 |
" --rate=" + ratestr, shell=True,
|
57 |
stdout=subprocess.PIPE,
|
58 |
stdin=subprocess.PIPE)
|
59 |
p.wait()
|
60 |
-
|
61 |
-
spk=spk,
|
62 |
-
slice_db=-40,
|
63 |
-
cluster_infer_ratio=0,
|
64 |
-
noice_scale=0.4,
|
65 |
-
clip_seconds=20,
|
66 |
-
tran=vc_transform,
|
67 |
-
f0_predictor=f0p,
|
68 |
-
auto_predict_f0=auto_f0)
|
69 |
-
os.remove(output_file)
|
70 |
-
return 44100, out_audio
|
71 |
|
72 |
return svc_fn, tts_fn
|
73 |
|
@@ -104,10 +95,11 @@ if __name__ == '__main__':
|
|
104 |
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
105 |
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
106 |
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
107 |
-
|
|
|
108 |
else:
|
109 |
-
|
110 |
-
|
111 |
with gr.Row():
|
112 |
gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
|
113 |
tts_rate = gr.Number(label='语速 (正负百分比)', value=0)
|
@@ -117,6 +109,9 @@ if __name__ == '__main__':
|
|
117 |
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
118 |
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
119 |
tts_submit = gr.Button("生成", variant="primary")
|
|
|
|
|
|
|
120 |
|
121 |
with gr.Column():
|
122 |
gr.Markdown(
|
@@ -125,6 +120,5 @@ if __name__ == '__main__':
|
|
125 |
'</div>'
|
126 |
)
|
127 |
vc_output = gr.Audio(label="输出音频")
|
128 |
-
|
129 |
-
tts_submit.click(tts_fn, [text_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor], vc_output)
|
130 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|
|
|
20 |
|
21 |
|
22 |
def create_fn(model, spk):
|
23 |
+
def svc_infer(audio_path, vc_transform, f0p, auto_f0):
|
24 |
+
out_audio = model.slice_inference(raw_audio_path=audio_path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
spk=spk,
|
26 |
slice_db=-40,
|
27 |
cluster_infer_ratio=0,
|
|
|
30 |
tran=vc_transform,
|
31 |
f0_predictor=f0p,
|
32 |
auto_predict_f0=auto_f0)
|
33 |
+
os.remove(audio_path)
|
34 |
return 44100, out_audio
|
35 |
|
36 |
+
def svc_fn(input_audio, vc_transform, auto_f0, f0p):
    """Run voice conversion on an uploaded audio clip.

    Args:
        input_audio: ``None`` when nothing was uploaded, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array. Multi-dimensional arrays are presumably laid out as
            (n_samples, n_channels) -- TODO confirm against the UI component.
        vc_transform: Pitch transform value forwarded to ``svc_infer``.
        auto_f0: Whether automatic F0 prediction is enabled.
        f0p: Name of the F0 predictor to use.

    Returns:
        ``(0, None)`` when there is no input, otherwise whatever
        ``svc_infer`` returns for the normalised clip.
    """
    if input_audio is None:
        return 0, None

    sampling_rate, audio = input_audio

    # Integer PCM is scaled into [-1, 1]; np.iinfo() raises on floating-point
    # dtypes, so float input is only cast and otherwise left untouched.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)

    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))

    temp_path = "temp.wav"
    soundfile.write(temp_path, audio, sampling_rate, format="wav")

    # Pass by keyword: svc_infer declares (audio_path, vc_transform, f0p,
    # auto_f0), so the previous positional call swapped the last two values.
    return svc_infer(temp_path, vc_transform, f0p=f0p, auto_f0=auto_f0)
|
46 |
+
|
47 |
def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
    """Synthesise speech with edge-tts, then run voice conversion on it.

    Args:
        input_text: Text to speak. Empty text short-circuits the call.
        gender: '女' selects the XiaoyiNeural voice; any other value selects
            the YunxiNeural voice.
        tts_rate: Speech-rate offset, rendered with the ``%`` format type
            (so 0.5 becomes "+50%").
        vc_transform: Pitch transform value forwarded to ``svc_infer``.
        auto_f0: Whether automatic F0 prediction is enabled.
        f0p: Name of the F0 predictor to use.

    Returns:
        ``(0, None)`` when there is no text, otherwise whatever
        ``svc_infer`` returns for the synthesised clip.
    """
    if not input_text:
        return 0, None

    voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
    # The '+' flag always emits an explicit sign, which edge-tts requires.
    ratestr = "{:+.0%}".format(tts_rate)
    temp_path = "temp.wav"

    # The text comes straight from the web UI, so it must never reach a shell.
    # An argument list keeps quotes, spaces and metacharacters literal, and the
    # "--text=" form stops text beginning with '-' being parsed as an option.
    command = [
        "edge-tts",
        "--text=" + input_text,
        "--write-media", temp_path,
        "--voice", voice,
        "--rate=" + ratestr,
    ]
    # run() drains the pipes while waiting, unlike Popen(...).wait(), which can
    # block forever once the child fills the stdout pipe.
    subprocess.run(command, stdout=subprocess.PIPE, stdin=subprocess.PIPE)

    # Pass by keyword: svc_infer declares (audio_path, vc_transform, f0p,
    # auto_f0), so the previous positional call swapped the last two values.
    return svc_infer(temp_path, vc_transform, f0p=f0p, auto_f0=auto_f0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
return svc_fn, tts_fn
|
64 |
|
|
|
95 |
auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
|
96 |
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
97 |
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
98 |
+
svc_submit = gr.Button("生成", variant="primary")
|
99 |
+
svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
|
100 |
else:
|
101 |
+
tts_input = gr.Textbox(label='说话内容', value='',
|
102 |
+
placeholder='请输入说话内容,(已支持无限长内容,处理时间约为说完原内容时间的5倍)')
|
103 |
with gr.Row():
|
104 |
gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
|
105 |
tts_rate = gr.Number(label='语速 (正负百分比)', value=0)
|
|
|
109 |
f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
|
110 |
choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
|
111 |
tts_submit = gr.Button("生成", variant="primary")
|
112 |
+
tts_submit.click(tts_fn,
|
113 |
+
[tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
|
114 |
+
vc_output)
|
115 |
|
116 |
with gr.Column():
|
117 |
gr.Markdown(
|
|
|
120 |
'</div>'
|
121 |
)
|
122 |
vc_output = gr.Audio(label="输出音频")
|
123 |
+
|
|
|
124 |
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|