Katock committed · commit 77ab97d · 1 parent: c0c81a2
Update app.py
app.py
CHANGED
@@ -1,13 +1,14 @@
 import argparse
-import io
 import logging
 import os
+import re
 import subprocess
+import gradio.processing_utils as gr_pu
 import gradio as gr
-import gradio.processing_utils as gr_processing_utils
 import librosa
 import numpy as np
 import soundfile
+from scipy.io import wavfile
 
 from inference.infer_tool import Svc
 
@@ -17,11 +18,20 @@ logging.getLogger('urllib3').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 limitation = os.getenv("SYSTEM") == "spaces"  # limit audio length in huggingface spaces
+sampling_rate = 44100
 
 
 def create_fn(model, spk):
-    def svc_infer(…
-    …
+    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
+        if input_audio is None:
+            return 0, None
+        sr, audio = input_audio
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        temp_path = "temp.wav"
+        soundfile.write(temp_path, audio, sr, format="wav")
+        out_audio = model.slice_inference(raw_audio_path=temp_path,
                                           spk=spk,
                                           slice_db=-40,
                                           cluster_infer_ratio=0,
@@ -30,23 +40,14 @@ def create_fn(model, spk):
                                           tran=vc_transform,
                                           f0_predictor=f0p,
                                           auto_predict_f0=auto_f0)
-        os.remove(…
-        return …
-
-    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
-        if input_audio is None:
-            return 0, None
-        sampling_rate, audio = input_audio
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        temp_path = "temp.wav"
-        soundfile.write(temp_path, audio, sampling_rate, format="wav")
-        return svc_infer(temp_path, vc_transform, auto_f0, f0p)
+        os.remove(temp_path)
+        return sr, out_audio
 
     def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
         if input_text == '':
             return 0, None
+
+        input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
         voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
         ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
         temp_path = "temp.wav"
@@ -58,7 +59,15 @@ def create_fn(model, spk):
                              stdout=subprocess.PIPE,
                              stdin=subprocess.PIPE)
         p.wait()
-        …
+
+        audio, sr = librosa.load(temp_path)
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+        os.remove(temp_path)
+        temp_path = "temp.wav"
+        wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
+        sr, audio = gr_pu.audio_from_file(temp_path)
+        input_audio = (sr, audio)
+        return svc_fn(input_audio, vc_transform, auto_f0, f0p)
 
     return svc_fn, tts_fn
 
@@ -88,7 +97,7 @@ if __name__ == '__main__':
         with gr.Column():
             with gr.Row():
                 vc_transform = gr.Number(label="音高调整 (正负半音,12为一个八度)", value=0)
-                auto_f0 = gr.Checkbox(label="自动音高预测 (…
+                auto_f0 = gr.Checkbox(label="自动音高预测 (文本转语音可选)", value=False)
                 f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
                                         choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
         with gr.Tabs():
@@ -108,7 +117,7 @@ if __name__ == '__main__':
         with gr.Column():
             gr.Markdown(
                 '<div align="center">'
-                f'<img style="width:auto;height:…
+                f'<img style="width:auto;height:400px;" src="file/{cover}">' if cover else ""
                 '</div>'
             )
             vc_output = gr.Audio(label="输出音频")
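
A note on the rewritten svc_fn above: Gradio's Audio input hands the callback a (sample_rate, numpy_array) tuple of integer PCM samples, so the function first scales the samples to float32 and downmixes stereo to mono before writing the temporary WAV. A minimal, self-contained sketch of that preprocessing step, using the same calls as the diff (the helper name is illustrative, not part of the commit):

import numpy as np
import librosa

def preprocess(input_audio):
    # Gradio supplies (sample_rate, samples) with integer PCM samples.
    sr, audio = input_audio
    # Dividing by the dtype's maximum scales the signal into [-1, 1].
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Stereo arrives as (n_samples, n_channels); librosa.to_mono expects
    # channels first, hence the transpose before averaging down to mono.
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    return sr, audio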
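The new tail of tts_fn converts the synthesized speech to the model's sample rate before feeding it back through svc_fn. Since librosa.load returns float32 audio at its default 22050 Hz, the code resamples to the module-level sampling_rate of 44100 and rewrites the file as 16-bit PCM WAV. A sketch of that chain under the same assumptions (helper name illustrative):

import numpy as np
import librosa
from scipy.io import wavfile

sampling_rate = 44100  # module-level target rate introduced by this commit

def resample_to_model_rate(temp_path="temp.wav"):
    # librosa.load returns float32 mono audio, resampled to 22050 Hz by default.
    audio, sr = librosa.load(temp_path)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
    # Rescale float [-1, 1] samples to int16 so the result is standard 16-bit PCM.
    wavfile.write(temp_path, sampling_rate,
                  (audio * np.iinfo(np.int16).max).astype(np.int16))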
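Finally, the ratestr built in tts_fn is worth a gloss: Python's "{:.0%}" format renders -0.1 as "-10%" but 0.25 as "25%" with no sign, while the speech-rate argument of the TTS subprocess (presumably edge-tts, given the zh-CN-*Neural voice names) takes an explicitly signed value, so the code prepends "+" for non-negative rates. A self-contained illustration (function name hypothetical):

def make_ratestr(tts_rate: float) -> str:
    # "{:.0%}" multiplies by 100 and appends "%": 0.25 -> "25%", -0.1 -> "-10%".
    return "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)

print(make_ratestr(0.25))  # +25%
print(make_ratestr(-0.1))  # -10%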