Spaces:
Sleeping
Sleeping
Katock
committed on
Commit
·
53a560c
1
Parent(s):
6ead1f4
debug
Browse files
- app.py +11 -19
- inference/infer_tool.py +4 -4
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import argparse
|
|
|
2 |
import logging
|
3 |
import os
|
4 |
|
@@ -6,6 +7,7 @@ import gradio as gr
|
|
6 |
import gradio.processing_utils as gr_processing_utils
|
7 |
import librosa
|
8 |
import numpy as np
|
|
|
9 |
|
10 |
from inference.infer_tool import Svc
|
11 |
|
@@ -16,6 +18,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
|
16 |
|
17 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
18 |
|
|
|
19 |
# audio_postprocess_ori = gr.Audio.postprocess
|
20 |
|
21 |
|
@@ -37,31 +40,20 @@ def create_vc_fn(model, spk):
|
|
37 |
duration = audio.shape[0] / sampling_rate
|
38 |
if duration > 20 and limitation:
|
39 |
return "请上传小于20秒的音频,或点击右上角裁剪", None
|
40 |
-
print("audio1: ", audio)
|
41 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
42 |
if len(audio.shape) > 1:
|
43 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
44 |
if sampling_rate != 16000:
|
45 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
46 |
-
print("audio2: ", audio)
|
47 |
-
input_audio = sampling_rate, audio
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
out_audio = model.slice_inference(input_audio=input_audio,
|
58 |
-
spk=spk,
|
59 |
-
tran=vc_transform,
|
60 |
-
slice_db=-40,
|
61 |
-
f0_predictor=f0p,
|
62 |
-
cluster_infer_ratio=0,
|
63 |
-
auto_predict_f0=auto_f0,
|
64 |
-
noice_scale=0.4)
|
65 |
return "Success", (44100, out_audio.cpu().numpy())
|
66 |
|
67 |
return vc_fn
|
|
|
1 |
import argparse
|
2 |
+
import io
|
3 |
import logging
|
4 |
import os
|
5 |
|
|
|
7 |
import gradio.processing_utils as gr_processing_utils
|
8 |
import librosa
|
9 |
import numpy as np
|
10 |
+
import soundfile
|
11 |
|
12 |
from inference.infer_tool import Svc
|
13 |
|
|
|
18 |
|
19 |
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
20 |
|
21 |
+
|
22 |
# audio_postprocess_ori = gr.Audio.postprocess
|
23 |
|
24 |
|
|
|
40 |
duration = audio.shape[0] / sampling_rate
|
41 |
if duration > 20 and limitation:
|
42 |
return "请上传小于20秒的音频,或点击右上角裁剪", None
|
|
|
43 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
44 |
if len(audio.shape) > 1:
|
45 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
46 |
if sampling_rate != 16000:
|
47 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
|
|
|
|
48 |
|
49 |
+
raw_audio_path = io.BytesIO()
|
50 |
+
soundfile.write(raw_audio_path, audio, sampling_rate, format="wav")
|
51 |
+
raw_audio_path.seek(0)
|
52 |
+
out_audio, _, _ = model.infer(raw_path=raw_audio_path,
|
53 |
+
speaker=spk,
|
54 |
+
tran=vc_transform,
|
55 |
+
f0_predictor=f0p,
|
56 |
+
auto_predict_f0=auto_f0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
return "Success", (44100, out_audio.cpu().numpy())
|
58 |
|
59 |
return vc_fn
|
inference/infer_tool.py
CHANGED
@@ -359,7 +359,7 @@ class Svc(object):
|
|
359 |
gc.collect()
|
360 |
|
361 |
def slice_inference(self,
|
362 |
-
|
363 |
spk,
|
364 |
tran,
|
365 |
slice_db,
|
@@ -382,9 +382,9 @@ class Svc(object):
|
|
382 |
if len(self.spk2id) == 1:
|
383 |
spk = self.spk2id.keys()[0]
|
384 |
use_spk_mix = False
|
385 |
-
|
386 |
-
chunks = slicer.cut(
|
387 |
-
audio_data, audio_sr = slicer.chunks2audio(
|
388 |
per_size = int(clip_seconds * audio_sr)
|
389 |
lg_size = int(lg_num * audio_sr)
|
390 |
lg_size_r = int(lg_size * lgr_num)
|
|
|
359 |
gc.collect()
|
360 |
|
361 |
def slice_inference(self,
|
362 |
+
raw_audio_path,
|
363 |
spk,
|
364 |
tran,
|
365 |
slice_db,
|
|
|
382 |
if len(self.spk2id) == 1:
|
383 |
spk = self.spk2id.keys()[0]
|
384 |
use_spk_mix = False
|
385 |
+
wav_path = Path(raw_audio_path).with_suffix('.wav')
|
386 |
+
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
387 |
+
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
388 |
per_size = int(clip_seconds * audio_sr)
|
389 |
lg_size = int(lg_num * audio_sr)
|
390 |
lg_size_r = int(lg_size * lgr_num)
|