Spaces:

GroveStreet
/

GTA_SOVITS

Sleeping

Katock committed on Jan 2, 2024

Commit

605d345

1 Parent(s): 29efbb8

修复上传干声采样率不齐导致的语速问题

Files changed (1) hide show

app.py CHANGED Viewed

@@ -41,10 +41,11 @@ def create_fn(model, spk):
             return 0, None
         sr, audio = input_audio
         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
         if len(audio.shape) > 1:
             audio = librosa.to_mono(audio.transpose(1, 0))
         temp_path = "temp.wav"
-        soundfile.write(temp_path, audio, sr, format="wav")
         model.hubert_model = hubert_dict[model.speech_encoder]
         out_audio = model.slice_inference(raw_audio_path=temp_path,
@@ -58,7 +59,7 @@ def create_fn(model, spk):
                                           auto_predict_f0=auto_f0)
         model.clear_empty()
         os.remove(temp_path)
-        return sr, out_audio
     async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
         if input_text == '':
@@ -79,7 +80,7 @@ def create_fn(model, spk):
         temp_path = "temp.wav"
         wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
         sr, audio = gr_pu.audio_from_file(temp_path)
-        input_audio = (sr, audio)
         return svc_fn(input_audio, vc_transform, auto_f0, f0p)
     return svc_fn, tts_fn

             return 0, None
         sr, audio = input_audio
         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
         if len(audio.shape) > 1:
             audio = librosa.to_mono(audio.transpose(1, 0))
         temp_path = "temp.wav"
+        soundfile.write(temp_path, audio, sampling_rate, format="wav")
         model.hubert_model = hubert_dict[model.speech_encoder]
         out_audio = model.slice_inference(raw_audio_path=temp_path,
                                           auto_predict_f0=auto_f0)
         model.clear_empty()
         os.remove(temp_path)
+        return sampling_rate, out_audio
     async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
         if input_text == '':
         temp_path = "temp.wav"
         wavfile.write(temp_path, sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
         sr, audio = gr_pu.audio_from_file(temp_path)
+        input_audio = (sampling_rate, audio)
         return svc_fn(input_audio, vc_transform, auto_f0, f0p)
     return svc_fn, tts_fn