ewebspace committed on
Commit e37c124 · verified · 1 Parent(s): 625c79b

Create app.py

Files changed (1)
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
+ import os
+ import tempfile
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ import gradio as gr
+ import numpy as np
+ import soundfile as sf
+ import librosa
+ import torch  # needed for the CUDA availability check below
+
+ from huggingface_hub import snapshot_download
+
+ # ------------------------------
+ # Model bootstrap
+ # ------------------------------
+ MODEL_DIR = os.path.join(os.getcwd(), "models")
+ OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"
+
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ # Lazy imports to speed up Space boot
+ _openvoice_loaded = False
+ _tone_converter = None
+ _content_extractor = None
+
+ _demucs_model = None
+
+ def _ensure_openvoice():
+     global _openvoice_loaded, _tone_converter, _content_extractor
+     if _openvoice_loaded:
+         return
+     # Download model snapshots into ./models/openvoice
+     local_dir = snapshot_download(repo_id=OPENVOICE_REPO, local_dir=os.path.join(MODEL_DIR, "openvoice"), local_dir_use_symlinks=False)
+
+     # OpenVoice v2 snapshots ship python modules; import them after download
+     import sys
+     if local_dir not in sys.path:
+         sys.path.append(local_dir)
+
+     # Import OpenVoice components
+     try:
+         from openvoice import se_extractor
+         from openvoice.api import ToneColorConverter, ContentVec
+     except Exception:
+         # Fall back to module paths used in some snapshots
+         from tone_color_converter.api import ToneColorConverter
+         from contentvec.api import ContentVec
+         from se_extractor import se_extractor
+
+     # Init content extractor (HuBERT-like)
+     content_ckpt = os.path.join(local_dir, "checkpoints", "contentvec", "checkpoint.pth")
+     _content_extractor = ContentVec(content_ckpt)
+
+     # Init tone color converter (torch, not gradio, exposes the CUDA check)
+     device = os.environ.get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+     tcc_ckpt = os.path.join(local_dir, "checkpoints", "tone_color_converter", "checkpoint.pth")
+     _tone_converter = ToneColorConverter(tcc_ckpt, device=device)
+
+     _openvoice_loaded = True
+
+
+ def _ensure_demucs():
+     global _demucs_model
+     if _demucs_model is not None:
+         return
+     from demucs.apply import apply_model
+     from demucs.pretrained import get_model
+     from demucs.audio import AudioFile
+     _demucs_model = {
+         "apply_model": apply_model,
+         "get_model": get_model,
+         "AudioFile": AudioFile,
+     }
+
+
+ def separate_vocals(wav_path, stem="vocals"):
+     """Return paths to the separated vocals and accompaniment using htdemucs."""
+     _ensure_demucs()
+     apply_model = _demucs_model["apply_model"]
+     get_model = _demucs_model["get_model"]
+     AudioFile = _demucs_model["AudioFile"]
+
+     model = get_model(name="htdemucs")
+     model.cpu()
+
+     # AudioFile.read() returns a (channels, samples) tensor; it is not a
+     # context manager, so read it directly and normalize as demucs expects
+     mix = AudioFile(wav_path).read(streams=0, samplerate=44100, channels=2)
+     ref = mix.mean(0)
+     mix = (mix - ref.mean()) / ref.std()
+     out = apply_model(model, mix[None], shifts=1, split=True, overlap=0.25)[0]
+     out = out * ref.std() + ref.mean()
+     sources = {name: out[idx].numpy() for idx, name in enumerate(model.sources)}
+
+     # Save stems
+     base = os.path.splitext(os.path.basename(wav_path))[0]
+     out_dir = tempfile.mkdtemp(prefix="stems_")
+     vocal_path = os.path.join(out_dir, f"{base}_vocals.wav")
+     inst_path = os.path.join(out_dir, f"{base}_inst.wav")
+
+     sf.write(vocal_path, sources["vocals"].T, 44100)
+     # Average the remaining stems into an instrumental track
+     inst = sum(v for k, v in sources.items() if k != "vocals") / (len(model.sources) - 1)
+     sf.write(inst_path, inst.T, 44100)
+     return vocal_path, inst_path
+
+
+ def load_audio(x, sr=44100, mono=True):
+     y, _sr = librosa.load(x, sr=sr, mono=mono)
+     return y, sr
+
+
+ def save_audio(y, sr):
+     # tempfile.mktemp is deprecated and racy; keep a named file instead
+     f = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+     sf.write(f.name, y, sr)
+     return f.name
+
+
+ def match_length(a, b):
+     # Pad or trim a to match the length of b
+     if len(a) < len(b):
+         a = np.pad(a, (0, len(b) - len(a)))
+     else:
+         a = a[: len(b)]
+     return a
+
+
+ def convert_voice(reference_wav, source_vocal_wav, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0):
+     _ensure_openvoice()
+
+     # Load audio
+     ref, sr = load_audio(reference_wav, sr=16000, mono=True)
+     src, _ = load_audio(source_vocal_wav, sr=16000, mono=True)
+
+     # Extract content features from the source vocal
+     content = _content_extractor.extract(src, sr)
+
+     # Extract the speaker embedding / tone color from the reference.
+     # OpenVoice ships an SE (speaker encoder) util; we mimic it via the API if exposed.
+     try:
+         from openvoice import se_extractor
+         se = se_extractor.get_se(reference_wav, device=_tone_converter.device)
+     except Exception:
+         # Some snapshots expose a bare get_se function instead
+         from se_extractor import get_se
+         se = get_se(reference_wav)
+
+     # Run tone color conversion
+     converted = _tone_converter.convert(content, se, style_strength=style_strength)
+
+     y = converted
+
+     # Optional pitch & formant adjustments (light touch)
+     if abs(pitch_shift) > 1e-3:
+         # librosa >= 0.10 takes sr and n_steps as keyword arguments
+         y = librosa.effects.pitch_shift(y.astype(np.float32), sr=16000, n_steps=pitch_shift)
+     if abs(formant_shift) > 1e-3:
+         # Crude formant-esque EQ tilt via a high shelving filter
+         import scipy.signal as sps
+         w = 2 * np.pi * 1500 / 16000
+         b, a = sps.iirfilter(2, Wn=w / np.pi, btype='high', ftype='butter')
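
The diff is truncated mid-statement at app.py line 155, so the remainder of convert_voice and the UI wiring are not shown. As a rough orientation, here is a minimal usage sketch of how the functions above would likely chain together; it assumes convert_voice ultimately returns the converted vocal as a 16 kHz numpy array, and the file names, resampling, and remix step are illustrative, not part of the committed file:

    # Hypothetical pipeline sketch -- not part of the commit.
    # Assumes convert_voice() returns a 16 kHz numpy float array.
    song = "song.wav"            # full mix to re-voice
    reference = "reference.wav"  # voice to clone

    vocal_path, inst_path = separate_vocals(song)     # htdemucs stems
    converted = convert_voice(reference, vocal_path)  # tone-color transfer

    # Bring the converted vocal back to 44.1 kHz and remix with the instrumental
    vocal_44k = librosa.resample(np.asarray(converted, dtype=np.float32), orig_sr=16000, target_sr=44100)
    inst, sr = load_audio(inst_path, sr=44100, mono=True)
    vocal_44k = match_length(vocal_44k, inst)
    print(save_audio(vocal_44k + inst, sr))

Given the import gradio as gr at the top, the truncated tail of the file presumably wraps this pipeline in a Gradio interface for the Space.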