Katock committed · Commit 77ab97d · Parent(s): c0c81a2

Update app.py

Files changed (1):
  1. app.py +29 -20

app.py CHANGED
```diff
@@ -1,13 +1,14 @@
 import argparse
-import io
 import logging
 import os
+import re
 import subprocess
+import gradio.processing_utils as gr_pu
 import gradio as gr
-import gradio.processing_utils as gr_processing_utils
 import librosa
 import numpy as np
 import soundfile
+from scipy.io import wavfile
 
 from inference.infer_tool import Svc
 
@@ -17,11 +18,20 @@ logging.getLogger('urllib3').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
+sampling_rate = 44100
 
 
 def create_fn(model, spk):
-    def svc_infer(audio_path, vc_transform, f0p, auto_f0):
-        out_audio = model.slice_inference(raw_audio_path=audio_path,
+    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
+        if input_audio is None:
+            return 0, None
+        sr, audio = input_audio
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        temp_path = "temp.wav"
+        soundfile.write(temp_path, audio, sr, format="wav")
+        out_audio = model.slice_inference(raw_audio_path=temp_path,
                                           spk=spk,
                                           slice_db=-40,
                                           cluster_infer_ratio=0,
@@ -30,23 +40,14 @@ def create_fn(model, spk):
                                           tran=vc_transform,
                                           f0_predictor=f0p,
                                           auto_predict_f0=auto_f0)
-        os.remove(audio_path)
-        return 44100, out_audio
-
-    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
-        if input_audio is None:
-            return 0, None
-        sampling_rate, audio = input_audio
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        temp_path = "temp.wav"
-        soundfile.write(temp_path, audio, sampling_rate, format="wav")
-        return svc_infer(temp_path, vc_transform, auto_f0, f0p)
+        os.remove(temp_path)
+        return sr, out_audio
 
     def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
         if input_text == '':
             return 0, None
+
+        input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
         voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
         ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
         temp_path = "temp.wav"
@@ -58,7 +59,15 @@ def create_fn(model, spk):
                              stdout=subprocess.PIPE,
                              stdin=subprocess.PIPE)
         p.wait()
-        return svc_infer(temp_path, vc_transform, auto_f0, f0p)
+
+        audio, sr = librosa.load(temp_path)
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+        os.remove(temp_path)
+        temp_path = "temp.wav"
+        wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
+        sr, audio = gr_pu.audio_from_file(temp_path)
+        input_audio = (sr, audio)
+        return svc_fn(input_audio, vc_transform, auto_f0, f0p)
 
     return svc_fn, tts_fn
 
@@ -88,7 +97,7 @@ if __name__ == '__main__':
             with gr.Column():
                 with gr.Row():
                     vc_transform = gr.Number(label="音高调整 (正负半音,12为一个八度)", value=0)
-                    auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
+                    auto_f0 = gr.Checkbox(label="自动音高预测 (文本转语音可选)", value=False)
                     f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
                                             choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
                 with gr.Tabs():
@@ -108,7 +117,7 @@ if __name__ == '__main__':
             with gr.Column():
                 gr.Markdown(
                     '<div align="center">'
-                    f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
+                    f'<img style="width:auto;height:400px;" src="file/{cover}">' if cover else ""
                     '</div>'
                 )
                 vc_output = gr.Audio(label="输出音频")
```
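The commit folds the old svc_infer/svc_fn pair into a single svc_fn, so the None check, dtype scaling, mono downmix, and temp-file handling now live in one place. This also removes a latent bug: the old svc_infer was declared as (audio_path, vc_transform, f0p, auto_f0) but both call sites passed (temp_path, vc_transform, auto_f0, f0p), swapping f0p and auto_f0. A minimal sketch of the normalization step, assuming integer PCM input as Gradio's Audio component provides (the helper name is illustrative, not part of the commit):

```python
import numpy as np
import librosa

def normalize_gradio_audio(input_audio):
    # Illustrative helper mirroring svc_fn's preprocessing of the
    # (sample_rate, ndarray) tuple that gr.Audio hands to callbacks.
    sr, audio = input_audio
    # Scale integer PCM (e.g. int16) to float32 in [-1.0, 1.0];
    # np.iinfo assumes an integer dtype, which Gradio's numpy audio uses.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Gradio stores stereo as (samples, channels); librosa.to_mono
    # expects (channels, samples), hence the transpose.
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    return sr, audio
```

svc_fn then writes the float32 signal to temp.wav with soundfile, runs slice_inference on that file, and removes it, so nothing accumulates on disk between requests.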
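tts_fn no longer calls the removed svc_infer directly. It reloads the synthesized file, resamples it to the new module-level sampling_rate of 44100, rewrites it as 16-bit PCM, and reads it back through gradio.processing_utils.audio_from_file so the resulting tuple looks exactly like microphone input to svc_fn. A sketch of that round-trip (helper name illustrative; assumes temp_path holds the TTS output):

```python
import numpy as np
import librosa
from scipy.io import wavfile
import gradio.processing_utils as gr_pu

TARGET_SR = 44100  # the module-level sampling_rate this commit introduces

def reload_at_target_rate(temp_path="temp.wav"):
    # librosa.load returns float32 and resamples to 22050 Hz by default,
    # so a second resample is needed to reach the model's 44.1 kHz.
    audio, sr = librosa.load(temp_path)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
    # Rewrite as 16-bit PCM so svc_fn's dtype-based scaling applies.
    wavfile.write(temp_path, TARGET_SR,
                  (audio * np.iinfo(np.int16).max).astype(np.int16))
    # audio_from_file returns the same (sample_rate, ndarray) pair a
    # gr.Audio component would deliver, so svc_fn can be reused as-is.
    return gr_pu.audio_from_file(temp_path)
```

In the committed version, deleting temp.wav and immediately reassigning the same path is redundant but harmless; the net effect is an in-place rewrite of the file.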
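zh-CN-XiaoyiNeural and zh-CN-YunxiNeural are Microsoft neural TTS voices (the 女, "female", branch selects Xiaoyi), so the subprocess presumably shells out to an edge-tts style CLI; the command line itself sits outside the hunks shown. Such CLIs take a signed percentage rate, which explains the ratestr branch: "{:.0%}" keeps the minus sign on negative rates but adds no plus on non-negative ones. The new re.sub call likewise strips newlines, commas, parentheses, and spaces that could otherwise break the command or the synthesis.

```python
def format_rate(tts_rate):
    # "{:.0%}" renders -0.25 as "-25%" but 0.1 as "10%", so the
    # non-negative branch adds the explicit "+" the TTS CLI expects.
    return "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)

assert format_rate(0.1) == "+10%"
assert format_rate(0.0) == "+0%"
assert format_rate(-0.25) == "-25%"
```

The remaining UI hunks are cosmetic: the auto-f0 checkbox label changes from 自动音高预测 (正常说话可选) ("automatic pitch prediction, optional for normal speech") to 自动音高预测 (文本转语音可选) ("optional for text-to-speech"), and the cover image height grows from 300px to 400px.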