Katock committed on
Commit
8c35828
·
1 Parent(s): dc6504c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -34
app.py CHANGED
@@ -20,16 +20,8 @@ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingfac
20
 
21
 
22
  def create_fn(model, spk):
23
- def svc_fn(input_audio, vc_transform, auto_f0, f0p):
24
- if input_audio is None:
25
- return "请先上传音频", None
26
- sampling_rate, audio = input_audio
27
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
28
- if len(audio.shape) > 1:
29
- audio = librosa.to_mono(audio.transpose(1, 0))
30
- temp_path = "temp.wav"
31
- soundfile.write(temp_path, audio, sampling_rate, format="wav")
32
- out_audio = model.slice_inference(raw_audio_path=temp_path,
33
  spk=spk,
34
  slice_db=-40,
35
  cluster_infer_ratio=0,
@@ -38,36 +30,35 @@ def create_fn(model, spk):
38
  tran=vc_transform,
39
  f0_predictor=f0p,
40
  auto_predict_f0=auto_f0)
41
- os.remove(temp_path)
42
  return 44100, out_audio
43
 
 
 
 
 
 
 
 
 
 
 
 
44
  def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
 
 
45
  voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
46
- output_file = "temp.wav"
47
- if tts_rate >= 0:
48
- ratestr = "+{:.0%}".format(tts_rate)
49
- elif tts_rate < 0:
50
- ratestr = "{:.0%}".format(tts_rate) # 减号自带
51
-
52
  p = subprocess.Popen("edge-tts " +
53
  " --text " + input_text +
54
- " --write-media " + output_file +
55
  " --voice " + voice +
56
  " --rate=" + ratestr, shell=True,
57
  stdout=subprocess.PIPE,
58
  stdin=subprocess.PIPE)
59
  p.wait()
60
- out_audio = model.slice_inference(raw_audio_path=output_file,
61
- spk=spk,
62
- slice_db=-40,
63
- cluster_infer_ratio=0,
64
- noice_scale=0.4,
65
- clip_seconds=20,
66
- tran=vc_transform,
67
- f0_predictor=f0p,
68
- auto_predict_f0=auto_f0)
69
- os.remove(output_file)
70
- return 44100, out_audio
71
 
72
  return svc_fn, tts_fn
73
 
@@ -104,10 +95,11 @@ if __name__ == '__main__':
104
  auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
105
  f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
106
  choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
107
- vc_submit = gr.Button("生成", variant="primary")
 
108
  else:
109
- text_input = gr.Textbox(label='说话内容', value='',
110
- placeholder='请输入说话内容,(已支持无限长内容,处理时间约为说完原内容时间的5倍)')
111
  with gr.Row():
112
  gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
113
  tts_rate = gr.Number(label='语速 (正负百分比)', value=0)
@@ -117,6 +109,9 @@ if __name__ == '__main__':
117
  f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
118
  choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
119
  tts_submit = gr.Button("生成", variant="primary")
 
 
 
120
 
121
  with gr.Column():
122
  gr.Markdown(
@@ -125,6 +120,5 @@ if __name__ == '__main__':
125
  '</div>'
126
  )
127
  vc_output = gr.Audio(label="输出音频")
128
- vc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
129
- tts_submit.click(tts_fn, [text_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor], vc_output)
130
  app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
 
20
 
21
 
22
def create_fn(model, spk):
    """Build the two Gradio callbacks (voice-conversion and TTS) bound to one model/speaker.

    Args:
        model: loaded so-vits-svc model exposing ``slice_inference``.
        spk:   speaker id string passed through to the model.

    Returns:
        (svc_fn, tts_fn) tuple of callbacks suitable for Gradio ``click`` handlers.
    """

    def svc_infer(audio_path, vc_transform, f0p, auto_f0):
        # Run sliced inference on the temp wav, then delete it.
        out_audio = model.slice_inference(raw_audio_path=audio_path,
                                          spk=spk,
                                          slice_db=-40,
                                          cluster_infer_ratio=0,
                                          noice_scale=0.4,
                                          clip_seconds=20,
                                          tran=vc_transform,
                                          f0_predictor=f0p,
                                          auto_predict_f0=auto_f0)
        os.remove(audio_path)
        return 44100, out_audio

    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        if input_audio is None:
            return 0, None
        sampling_rate, audio = input_audio
        # Normalize integer PCM to float32 in [-1, 1].
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
        # BUGFIX: svc_infer expects (path, transform, f0p, auto_f0); the original
        # call passed auto_f0 and f0p swapped, feeding a bool into f0_predictor.
        return svc_infer(temp_path, vc_transform, f0p, auto_f0)

    def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        if input_text == '':
            return 0, None
        voice = "zh-CN-XiaoyiNeural" if gender == '女' else "zh-CN-YunxiNeural"
        # edge-tts rate string needs an explicit '+' for non-negative values;
        # negative percentages already carry the minus sign.
        ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
        temp_path = "temp.wav"
        # SECURITY NOTE: shell=True with user-supplied input_text interpolated into
        # the command string allows shell injection; prefer
        # subprocess.run(["edge-tts", "--text", input_text, ...], shell=False).
        p = subprocess.Popen("edge-tts " +
                             " --text " + input_text +
                             " --write-media " + temp_path +
                             " --voice " + voice +
                             " --rate=" + ratestr, shell=True,
                             stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE)
        p.wait()
        # BUGFIX: same argument-order correction as in svc_fn.
        return svc_infer(temp_path, vc_transform, f0p, auto_f0)

    return svc_fn, tts_fn
64
 
 
95
  auto_f0 = gr.Checkbox(label="自动音高预测 (正常说话可选)", value=False)
96
  f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
97
  choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
98
+ svc_submit = gr.Button("生成", variant="primary")
99
+ svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
100
  else:
101
+ tts_input = gr.Textbox(label='说话内容', value='',
102
+ placeholder='请输入说话内容,(已支持无限长内容,处理时间约为说完原内容时间的5倍)')
103
  with gr.Row():
104
  gender = gr.Radio(label='说话人性别', value='男', choices=['男', '女'])
105
  tts_rate = gr.Number(label='语速 (正负百分比)', value=0)
 
109
  f0_predictor = gr.Radio(label="f0预测器 (对电音有影响)",
110
  choices=['crepe', 'harvest', 'dio', 'pm'], value='crepe')
111
  tts_submit = gr.Button("生成", variant="primary")
112
+ tts_submit.click(tts_fn,
113
+ [tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
114
+ vc_output)
115
 
116
  with gr.Column():
117
  gr.Markdown(
 
120
  '</div>'
121
  )
122
  vc_output = gr.Audio(label="输出音频")
123
+
 
124
  app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)