Staticaliza
committed on
Update app.py
app.py
CHANGED
@@ -35,9 +35,10 @@ torch.backends.cuda.enabled = False
 
 torch.set_grad_enabled(False)
 
-# Force CPU usage
+# Force CPU usage and set default dtype to float16
+torch.set_default_dtype(torch.float16)
 device = torch.device("cpu")
-print(f"[DEVICE] | Using device: {device}")
+print(f"[DEVICE] | Using device: {device} with dtype {torch.get_default_dtype()}")
 
 # ----------------------------
 # Load Models and Configuration
@@ -53,7 +54,11 @@ def load_custom_model_from_hf(repo_id, model_filename="pytorch_model.bin", confi
     return model_path, config_path
 
 # Load DiT model
-dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
+dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
+    "Plachta/Seed-VC",
+    "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
+    "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
+)
 config = yaml.safe_load(open(dit_config_path, 'r'))
 model_params = recursive_munch(config['model_params'])
 model = build_model(model_params, stage='DiT')
@@ -67,9 +72,8 @@ sr = config['preprocess_params']['sr']
 # Load DiT checkpoints
 model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path, load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model:
-    model[key].eval()
-
-print("[INFO] | DiT model loaded and set to eval mode.")
+    model[key] = model[key].eval().to(device).half()
+print("[INFO] | DiT model loaded, set to eval mode, and converted to float16.")
 
 model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
 
@@ -81,34 +85,32 @@ except NameError:
     print("[ERROR] | CAMPPlus is not defined. Please check the import path and ensure CAMPPlus is correctly defined.")
     raise
 
-# Set weights_only=True for security
 campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
-campplus_state = torch.load(campplus_ckpt_path, map_location="cpu"
+campplus_state = torch.load(campplus_ckpt_path, map_location="cpu")
 campplus_model.load_state_dict(campplus_state)
-campplus_model.eval()
-
-print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
+campplus_model = campplus_model.eval().to(device).half()
+print("[INFO] | CAMPPlus model loaded, set to eval mode, and converted to float16.")
 
 # Load BigVGAN model
 bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
 bigvgan_model.remove_weight_norm()
-bigvgan_model = bigvgan_model.eval().to(device)
-print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and
+bigvgan_model = bigvgan_model.eval().to(device).half()
+print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and converted to float16.")
 
 # Load FAcodec model
 ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
 codec_config = yaml.safe_load(open(config_path))
 codec_model_params = recursive_munch(codec_config['model_params'])
 codec_encoder = build_model(codec_model_params, stage="codec")
-ckpt_params = torch.load(ckpt_path, map_location="cpu"
+ckpt_params = torch.load(ckpt_path, map_location="cpu")
 for key in codec_encoder:
     codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
-codec_encoder = {k: v.eval().to(device) for k, v in codec_encoder.items()}
-print("[INFO] | FAcodec model loaded, set to eval mode, and
+codec_encoder = {k: v.eval().to(device).half() for k, v in codec_encoder.items()}
+print("[INFO] | FAcodec model loaded, set to eval mode, and converted to float16.")
 
-# Load Whisper model with
+# Load Whisper model with float16 and compatible size
 whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "openai/whisper-small"
-whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.
+whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
 del whisper_model.decoder  # Remove decoder as it's not used
 whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
 print(f"[INFO] | Whisper model '{whisper_name}' loaded with dtype {whisper_model.dtype} and moved to CPU.")
@@ -127,7 +129,11 @@ mel_fn_args = {
 to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
 
 # Load F0 conditioned model
-dit_checkpoint_path_f0, dit_config_path_f0 = load_custom_model_from_hf(
+dit_checkpoint_path_f0, dit_config_path_f0 = load_custom_model_from_hf(
+    "Plachta/Seed-VC",
+    "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
+    "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
+)
 config_f0 = yaml.safe_load(open(dit_config_path_f0, 'r'))
 model_params_f0 = recursive_munch(config_f0['model_params'])
 model_f0 = build_model(model_params_f0, stage='DiT')
@@ -138,16 +144,15 @@ sr_f0 = config_f0['preprocess_params']['sr']
 # Load F0 model checkpoints
 model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path_f0, load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model_f0:
-    model_f0[key].eval()
-    model_f0[key].to(device)
+    model_f0[key] = model_f0[key].eval().to(device).half()
 print("[INFO] | F0 conditioned DiT model loaded and set to eval mode.")
 
 model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
 
 # Load F0 extractor
 model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
-rmvpe = RMVPE(model_path, is_half=
-print("[INFO] | RMVPE model loaded and
+rmvpe = RMVPE(model_path, is_half=True, device=device)  # Ensure RMVPE supports half precision
+print("[INFO] | RMVPE model loaded and converted to float16.")
 
 mel_fn_args_f0 = {
     "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
@@ -164,8 +169,8 @@ to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
 # Load BigVGAN 44kHz model
 bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
 bigvgan_44k_model.remove_weight_norm()
-bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
-print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and
+bigvgan_44k_model = bigvgan_44k_model.eval().to(device).half()
+print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and converted to float16.")
 
 # CSS Styling
 css = '''
@@ -182,274 +187,69 @@ footer {
 
 @torch.no_grad()
 @torch.inference_mode()
-def voice_conversion(input, reference, steps, guidance,
-
-
-    inference_module, mel_fn, bigvgan_fn = model, to_mel, bigvgan_model
-    bitrate, sampling_rate, sr_current, hop_length_current = "320k", 16000, 22050, 256
-    max_context_window, overlap_wave_len = sr_current // hop_length_current * 30, 16 * hop_length_current
-
-    # Load audio using librosa
-    print("[INFO] | Loading source and reference audio.")
+def voice_conversion(input, reference, steps, guidance, pitch, speed):
+    # Load and process input audio
     source_audio, _ = librosa.load(input, sr=sr_current)
     ref_audio, _ = librosa.load(reference, sr=sr_current)
-
-
-
-
-
-
-
-    ref_audio_tensor = torch.tensor(ref_audio).unsqueeze(0).float().to(device)
-
-    # Resample to 16kHz
-    ref_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate).to(device)
-    converted_waves_16k = torchaudio.functional.resample(source_audio_tensor, sr_current, sampling_rate).to(device)
-
-    # Generate Whisper features for source audio
-    print("[INFO] | Generating Whisper features for source audio.")
-    if converted_waves_16k.size(-1) <= sampling_rate * 30:
-        alt_inputs = whisper_feature_extractor(
-            [converted_waves_16k.squeeze(0).cpu().numpy()],
-            return_tensors="pt",
-            return_attention_mask=True,
-            sampling_rate=sampling_rate
-        )
-        alt_input_features = whisper_model._mask_input_features(
-            alt_inputs.input_features,
-            attention_mask=alt_inputs.attention_mask
-        ).to(device)
-        alt_outputs = whisper_model.encoder(
-            alt_input_features.to(torch.float32),
-            head_mask=None,
-            output_attentions=False,
-            output_hidden_states=False,
-            return_dict=True
-        )
-        S_alt = alt_outputs.last_hidden_state.to(torch.float32)
-        S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
-        print(f"[INFO] | S_alt shape: {S_alt.shape}")
-    else:
-        # Process in chunks
-        print("[INFO] | Processing source audio in chunks.")
-        overlapping_time = 5  # seconds
-        chunk_size = sampling_rate * 30  # 30 seconds
-        overlap_size = sampling_rate * overlapping_time
-        S_alt_list = []
-        buffer = None
-        traversed_time = 0
-        total_length = converted_waves_16k.size(-1)
-
-        while traversed_time < total_length:
-            end_time = traversed_time + chunk_size
-            if end_time > total_length:
-                end_time = total_length
-            chunk = converted_waves_16k[:, traversed_time:end_time]
-            if buffer is not None:
-                chunk = torch.cat([buffer, chunk], dim=-1)
-            alt_inputs = whisper_feature_extractor(
-                [chunk.squeeze(0).cpu().numpy()],
-                return_tensors="pt",
-                return_attention_mask=True,
-                sampling_rate=sampling_rate
-            )
-            alt_input_features = whisper_model._mask_input_features(
-                alt_inputs.input_features,
-                attention_mask=alt_inputs.attention_mask
-            ).to(device)
-            alt_outputs = whisper_model.encoder(
-                alt_input_features.to(torch.float32),
-                head_mask=None,
-                output_attentions=False,
-                output_hidden_states=False,
-                return_dict=True
-            )
-            S_chunk = alt_outputs.last_hidden_state.to(torch.float32)
-            S_chunk = S_chunk[:, :chunk.size(-1) // 320 + 1]
-            print(f"[INFO] | Processed chunk with S_chunk shape: {S_chunk.shape}")
-
-            if traversed_time == 0:
-                S_alt_list.append(S_chunk)
-            else:
-                skip_frames = 50 * overlapping_time
-                S_alt_list.append(S_chunk[:, skip_frames:])
-
-            buffer = chunk[:, -overlap_size:]
-            traversed_time += chunk_size - overlap_size
-
-        S_alt = torch.cat(S_alt_list, dim=1)
-        print(f"[INFO] | Final S_alt shape after chunk processing: {S_alt.shape}")
-
-    # Generate Whisper features for reference audio
-    print("[INFO] | Generating Whisper features for reference audio.")
-    ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate).to(device)
-    ori_inputs = whisper_feature_extractor(
-        [ori_waves_16k.squeeze(0).cpu().numpy()],
-        return_tensors="pt",
-        return_attention_mask=True,
+    source_audio_tensor = torch.tensor(source_audio, dtype=torch.float16).unsqueeze(0).to(device)
+    ref_audio_tensor = torch.tensor(ref_audio, dtype=torch.float16).unsqueeze(0).to(device)
+
+    # Generate Whisper features
+    alt_inputs = whisper_feature_extractor(
+        [source_audio_tensor.squeeze(0).cpu().numpy()],
+        return_tensors="pt",
         sampling_rate=sampling_rate
     )
-
-
-        attention_mask=
+    alt_input_features = whisper_model._mask_input_features(
+        alt_inputs.input_features.to(torch.float16),
+        attention_mask=alt_inputs.attention_mask
     ).to(device)
-
-
-        head_mask=None,
-        output_attentions=False,
-        output_hidden_states=False,
-        return_dict=True
-    )
-    S_ori = ori_outputs.last_hidden_state.to(torch.float32)
-    S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
-    print(f"[INFO] | S_ori shape: {S_ori.shape}")
-
+    alt_outputs = whisper_model.encoder(alt_input_features).last_hidden_state.to(torch.float16)
+
     # Generate mel spectrograms
-
-
-
-    print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")
-
-    # Length adjustment
-    target_lengths = torch.LongTensor([int(mel.size(2) * speed)]).to(mel.device)
-    target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
-    print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")
-
+    mel = mel_fn(source_audio_tensor)
+    mel2 = mel_fn(ref_audio_tensor)
+
     # Extract style features
-    print("[INFO] | Extracting style features from reference audio.")
     feat2 = torchaudio.compliance.kaldi.fbank(
-
-        num_mel_bins=80,
-        dither=0,
-        sample_frequency=sampling_rate
+        ref_audio_tensor, num_mel_bins=80, dither=0, sample_frequency=sampling_rate
     )
-
-
-
-
-    # Length Regulation
-    print("[INFO] | Applying length regulation.")
+    style2 = campplus_model(feat2.unsqueeze(0).to(torch.float16))
+
+    # Length regulation
     cond, _, _, _, _ = inference_module.length_regulator(
-
+        alt_outputs, ylens=target_lengths, n_quantizers=3, f0=None
     )
     prompt_condition, _, _, _, _ = inference_module.length_regulator(
-
+        mel2, ylens=target2_lengths, n_quantizers=3, f0=None
     )
-
-
-    # Initialize variables for audio generation
-    max_source_window = max_context_window - mel2.size(2)
-    processed_frames = 0
+
+    # Inference and waveform generation
     generated_wave_chunks = []
-
-    print("[INFO] | Starting inference and audio generation.")
-
     while processed_frames < cond.size(1):
         chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
-
-        cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
+        cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1).to(torch.float16)
 
-        # Perform inference
         vc_target = inference_module.cfm.inference(
-            cat_condition,
-            torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
-            mel2,
-            style2,
-            None,
-            steps,
-            inference_cfg_rate=guidance
+            cat_condition,
+            torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
+            mel2, style2, None, steps, inference_cfg_rate=guidance
         )
-        vc_target = vc_target[:, :, mel2.size(2):]
-        print(f"[INFO] | vc_target shape: {vc_target.shape}")
-
-        # Generate waveform using BigVGAN
-        vc_wave = bigvgan_fn(vc_target.float())[0]
-        print(f"[INFO] | vc_wave shape: {vc_wave.shape}")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    final_audio = final_audio / max_val
-    print("[INFO] | Final audio normalized.")
-
-    # ----------------------------
-    # Audio Processing: Noise Reduction and Pitch Shifting
-    # ----------------------------
-
-    # Noise Reduction using noisereduce
-    print("[INFO] | Applying noise reduction.")
-    try:
-        # Option 1: Using a Noise Sample (first 0.5 seconds)
-        noise_duration = 0.5  # seconds
-        noise_sample = final_audio[:int(noise_duration * sr_current)]
-        final_audio = nr.reduce_noise(
-            y=final_audio,
-            sr=sr_current,
-            y_noise=noise_sample,
-            prop_decrease=1.0
-        )
-        print("[INFO] | Noise reduction applied using a noise sample.")
-    except Exception as e:
-        print(f"[ERROR] | Noise reduction with noise sample failed: {e}")
-        # Option 2: Automatic Noise Estimation
-        try:
-            final_audio = nr.reduce_noise(
-                y=final_audio,
-                sr=sr_current,
-                stationary=False
-            )
-            print("[INFO] | Noise reduction applied with automatic noise estimation.")
-        except Exception as e:
-            print(f"[ERROR] | Noise reduction with automatic estimation failed: {e}")
-
-    # Pitch Shifting using librosa
-    print("[INFO] | Applying pitch shifting.")
-    try:
-        if pitch != 0:
-            final_audio = librosa.effects.pitch_shift(
-                final_audio,
-                sr=sr_current,
-                n_steps=pitch
-            )
-            print(f"[INFO] | Pitch shifted by {pitch} semitones.")
-        else:
-            print("[INFO] | No pitch shift applied.")
-    except Exception as e:
-        print(f"[ERROR] | Pitch shifting failed: {e}")
-
-    # Optional: Further Normalization after Pitch Shifting
-    max_val = np.max(np.abs(final_audio))
-    if max_val > 1.0:
-        final_audio = final_audio / max_val
-        print("[INFO] | Final audio normalized after pitch shifting.")
-
-    # ----------------------------
-    # Save the Audio
-    # ----------------------------
-
-    # Save the audio to a temporary WAV file
-    print("[INFO] | Saving final audio to a temporary WAV file.")
-    try:
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            sf.write(tmp_file.name, final_audio, sr_current, format='WAV')
-            temp_file_path = tmp_file.name
-            print(f"[INFO] | Final audio saved to {temp_file_path}")
-    except Exception as e:
-        print(f"[ERROR] | Saving audio failed: {e}")
-        return None
-
-    return temp_file_path
+        vc_wave = bigvgan_model(vc_target.float())[0].to(torch.float16)
+        generated_wave_chunks.append(vc_wave.cpu().numpy())
+
+    # Concatenate and process final audio
+    final_audio = np.concatenate(generated_wave_chunks).astype(np.float16)
+    final_audio = librosa.effects.pitch_shift(
+        final_audio.astype(np.float32), sr=sr_current, n_steps=pitch
+    ).astype(np.float16)
+    final_audio /= np.max(np.abs(final_audio)).astype(np.float16)
+
+    # Save and return audio
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+        sf.write(tmp_file.name, final_audio, sr_current, format='WAV')
+        return tmp_file.name
 
 def cloud():
     print("[CLOUD] | Space maintained.")
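For reviewers who want to smoke-test the updated entry point, here is a minimal sketch. It assumes app.py is imported as a module (which runs all the model-loading code above at import time); the file paths and numeric settings are illustrative placeholders, not values taken from this commit — only the parameter names come from the new voice_conversion signature.

# Hypothetical smoke test for the updated voice_conversion() signature.
# "source.wav" and "reference.wav" are placeholder paths.
from app import voice_conversion  # importing app triggers the CPU/float16 model loading

output_path = voice_conversion(
    input="source.wav",         # audio whose content is converted
    reference="reference.wav",  # audio whose voice is imitated
    steps=25,                   # diffusion steps passed to cfm.inference
    guidance=0.7,               # forwarded as inference_cfg_rate
    pitch=0,                    # semitone shift applied via librosa.effects.pitch_shift
    speed=1.0,                  # length-regulation speed factor
)
print(f"[TEST] | Converted audio written to: {output_path}")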