Katock committed on
Commit 53a560c · 1 Parent(s): 6ead1f4
Files changed (2)
  1. app.py +11 -19
  2. inference/infer_tool.py +4 -4
app.py CHANGED
@@ -1,4 +1,5 @@
 import argparse
+import io
 import logging
 import os
 
@@ -6,6 +7,7 @@ import gradio as gr
 import gradio.processing_utils as gr_processing_utils
 import librosa
 import numpy as np
+import soundfile
 
 from inference.infer_tool import Svc
 
@@ -16,6 +18,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 limitation = os.getenv("SYSTEM") == "spaces"  # limit audio length in huggingface spaces
 
+
 # audio_postprocess_ori = gr.Audio.postprocess
 
 
@@ -37,31 +40,20 @@ def create_vc_fn(model, spk):
         duration = audio.shape[0] / sampling_rate
         if duration > 20 and limitation:
             return "请上传小于20秒的音频,或点击右上角裁剪", None
-        print("audio1: ", audio)
         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
         if len(audio.shape) > 1:
             audio = librosa.to_mono(audio.transpose(1, 0))
         if sampling_rate != 16000:
             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-        print("audio2: ", audio)
-        input_audio = sampling_rate, audio
 
-        # raw_path = io.BytesIO()
-        # soundfile.write(raw_path, audio, sampling_rate, format="wav")
-        # raw_path.seek(0)
-        # raw_audio_path = "tmp.wav",
-        # soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
-        # out_audio, out_sr, _ = model.infer(spk, vc_transform, raw_path,
-        #                                    auto_predict_f0=auto_f0,
-        #                                    )
-        out_audio = model.slice_inference(input_audio=input_audio,
-                                          spk=spk,
-                                          tran=vc_transform,
-                                          slice_db=-40,
-                                          f0_predictor=f0p,
-                                          cluster_infer_ratio=0,
-                                          auto_predict_f0=auto_f0,
-                                          noice_scale=0.4)
+        raw_audio_path = io.BytesIO()
+        soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
+        raw_audio_path.seek(0)
+        out_audio, _, _ = model.infer(raw_path=raw_audio_path,
+                                      speaker=spk,
+                                      tran=vc_transform,
+                                      f0_predictor=f0p,
+                                      auto_predict_f0=auto_f0)
         return "Success", (44100, out_audio.cpu().numpy())
 
     return vc_fn
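Net effect in app.py: the slice_inference call and the debug prints are removed, and vc_fn instead serializes the preprocessed audio into an in-memory WAV (the previously commented-out io.BytesIO/soundfile path, in slightly different form) and hands it to model.infer. Below is a minimal sketch of that call path outside Gradio. The checkpoint and config paths passed to Svc, and the defaults for vc_transform/auto_f0/f0p, are placeholders; only the preprocessing steps and the infer keyword arguments come from this commit.

import io

import librosa
import numpy as np
import soundfile

from inference.infer_tool import Svc

# Placeholder checkpoint/config paths -- not part of this commit.
model = Svc("logs/44k/G_0.pth", "configs/config.json")

def infer_from_array(audio, sampling_rate, spk, vc_transform=0, auto_f0=False, f0p="pm"):
    # Same preprocessing as vc_fn; audio is assumed to be integer PCM as Gradio provides it.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        sampling_rate = 16000
    # Wrap the array as an in-memory WAV so model.infer can read it like a file.
    raw_audio_path = io.BytesIO()
    soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
    raw_audio_path.seek(0)
    out_audio, _, _ = model.infer(raw_path=raw_audio_path,
                                  speaker=spk,
                                  tran=vc_transform,
                                  f0_predictor=f0p,
                                  auto_predict_f0=auto_f0)
    # app.py returns this to the Gradio Audio component at 44100 Hz.
    return out_audio.cpu().numpy()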
inference/infer_tool.py CHANGED
@@ -359,7 +359,7 @@ class Svc(object):
         gc.collect()
 
     def slice_inference(self,
-                        input_audio,
+                        raw_audio_path,
                         spk,
                         tran,
                         slice_db,
@@ -382,9 +382,9 @@
         if len(self.spk2id) == 1:
             spk = self.spk2id.keys()[0]
             use_spk_mix = False
-        # wav_path = Path(raw_audio_path).with_suffix('.wav')
-        chunks = slicer.cut(input_audio, db_thresh=slice_db)
-        audio_data, audio_sr = slicer.chunks2audio(input_audio, chunks)
+        wav_path = Path(raw_audio_path).with_suffix('.wav')
+        chunks = slicer.cut(wav_path, db_thresh=slice_db)
+        audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
         per_size = int(clip_seconds * audio_sr)
         lg_size = int(lg_num * audio_sr)
         lg_size_r = int(lg_size * lgr_num)
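On the infer_tool.py side, slice_inference again takes a path-like raw_audio_path (the previously commented-out wav_path line is restored, and slicer.cut/slicer.chunks2audio read that .wav path) instead of a (sampling_rate, audio) tuple. A rough usage sketch under that assumption follows; the checkpoint, config, and input file paths are placeholders, the keyword arguments mirror the slice_inference call that app.py made before this commit, and any parameters not shown are assumed to keep their defaults.

import librosa
import soundfile

from inference.infer_tool import Svc

# Placeholder checkpoint/config paths -- not part of this commit.
svc = Svc("logs/44k/G_0.pth", "configs/config.json")

# slice_inference reads from disk again, so persist the input as a real .wav file.
audio, sr = librosa.load("input.wav", sr=16000, mono=True)  # placeholder input file
raw_audio_path = "tmp.wav"
soundfile.write(raw_audio_path, audio, sr, format="wav")

out_audio = svc.slice_inference(raw_audio_path=raw_audio_path,
                                spk="speaker_name",   # a key of svc.spk2id
                                tran=0,               # pitch shift in semitones
                                slice_db=-40,         # slicing threshold, as in the old app.py call
                                f0_predictor="pm",
                                cluster_infer_ratio=0,
                                auto_predict_f0=False,
                                noice_scale=0.4)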