Katock committed on
Commit
605d345
·
1 Parent(s): 29efbb8

修复上传干声采样率不齐导致的语速问题

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -41,10 +41,11 @@ def create_fn(model, spk):
41
  return 0, None
42
  sr, audio = input_audio
43
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
 
44
  if len(audio.shape) > 1:
45
  audio = librosa.to_mono(audio.transpose(1, 0))
46
  temp_path = "temp.wav"
47
- soundfile.write(temp_path, audio, sr, format="wav")
48
 
49
  model.hubert_model = hubert_dict[model.speech_encoder]
50
  out_audio = model.slice_inference(raw_audio_path=temp_path,
@@ -58,7 +59,7 @@ def create_fn(model, spk):
58
  auto_predict_f0=auto_f0)
59
  model.clear_empty()
60
  os.remove(temp_path)
61
- return sr, out_audio
62
 
63
  async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
64
  if input_text == '':
@@ -79,7 +80,7 @@ def create_fn(model, spk):
79
  temp_path = "temp.wav"
80
  wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
81
  sr, audio = gr_pu.audio_from_file(temp_path)
82
- input_audio = (sr, audio)
83
  return svc_fn(input_audio, vc_transform, auto_f0, f0p)
84
 
85
  return svc_fn, tts_fn
 
41
  return 0, None
42
  sr, audio = input_audio
43
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
44
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
45
  if len(audio.shape) > 1:
46
  audio = librosa.to_mono(audio.transpose(1, 0))
47
  temp_path = "temp.wav"
48
+ soundfile.write(temp_path, audio, sampling_rate, format="wav")
49
 
50
  model.hubert_model = hubert_dict[model.speech_encoder]
51
  out_audio = model.slice_inference(raw_audio_path=temp_path,
 
59
  auto_predict_f0=auto_f0)
60
  model.clear_empty()
61
  os.remove(temp_path)
62
+ return sampling_rate, out_audio
63
 
64
  async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
65
  if input_text == '':
 
80
  temp_path = "temp.wav"
81
  wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
82
  sr, audio = gr_pu.audio_from_file(temp_path)
83
+ input_audio = (sampling_rate, audio)
84
  return svc_fn(input_audio, vc_transform, auto_f0, f0p)
85
 
86
  return svc_fn, tts_fn