NeoPy committed
Commit 4f1099f · verified · 1 Parent(s): 3ea8cc8

Update rvcinfpy/main.py

Files changed (1)
  1. rvcinfpy/main.py +855 -922
rvcinfpy/main.py CHANGED
@@ -1,922 +1,855 @@
- from rvcpy.lib.log_config import logger
- import torch
- import gc
- import numpy as np
- import os
- import warnings
- import threading
- from tqdm import tqdm
- from rvcpy.lib.infer_pack.models import (
-     SynthesizerTrnMs256NSFsid,
-     SynthesizerTrnMs256NSFsid_nono,
-     SynthesizerTrnMs768NSFsid,
-     SynthesizerTrnMs768NSFsid_nono,
- )
- from rvcpy.lib.audio import load_audio
- import soundfile as sf
- from scipy import signal
- from time import time as ttime
- import faiss
- from rvcpy.root_pipe import VC, change_rms, bh, ah
- import librosa
- from urllib.parse import urlparse
- import copy
-
- warnings.filterwarnings("ignore")
-
-
- class Config:
-     def __init__(self, only_cpu=False):
-         self.device = "cuda:0"
-         self.is_half = True
-         self.n_cpu = 0
-         self.gpu_name = None
-         self.gpu_mem = None
-         (
-             self.x_pad,
-             self.x_query,
-             self.x_center,
-             self.x_max
-         ) = self.device_config(only_cpu)
-
-     def device_config(self, only_cpu) -> tuple:
-         if torch.cuda.is_available() and not only_cpu:
-             i_device = int(self.device.split(":")[-1])
-             self.gpu_name = torch.cuda.get_device_name(i_device)
-             if (
-                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
-                 or "P40" in self.gpu_name.upper()
-                 or "1060" in self.gpu_name
-                 or "1070" in self.gpu_name
-                 or "1080" in self.gpu_name
-             ):
-                 logger.info(
-                     "16/10 Series GPUs and P40 excel "
-                     "in single-precision tasks."
-                 )
-                 self.is_half = False
-             else:
-                 self.gpu_name = None
-             self.gpu_mem = int(
-                 torch.cuda.get_device_properties(i_device).total_memory
-                 / 1024
-                 / 1024
-                 / 1024
-                 + 0.4
-             )
-         elif torch.backends.mps.is_available() and not only_cpu:
-             logger.info("Supported N-card not found, using MPS for inference")
-             self.device = "mps"
-         else:
-             logger.info("No supported N-card found, using CPU for inference")
-             self.device = "cpu"
-             self.is_half = False
-
-         if self.n_cpu == 0:
-             self.n_cpu = os.cpu_count()
-
-         if self.is_half:
-             # 6GB VRAM configuration
-             x_pad = 3
-             x_query = 10
-             x_center = 60
-             x_max = 65
-         else:
-             # 5GB VRAM configuration
-             x_pad = 1
-             x_query = 6
-             x_center = 38
-             x_max = 41
-
-         if self.gpu_mem is not None and self.gpu_mem <= 4:
-             x_pad = 1
-             x_query = 5
-             x_center = 30
-             x_max = 32
-
-         logger.info(
-             f"Config: Device is {self.device}, "
-             f"half precision is {self.is_half}"
-         )
-
-         return x_pad, x_query, x_center, x_max
-
-
- BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
- BASE_MODELS = [
-     "hubert_base.pt",
-     "rmvpe.pt"
- ]
- BASE_DIR = "."
-
-
- def load_file_from_url(
-     url: str,
-     model_dir: str,
-     file_name: str | None = None,
-     overwrite: bool = False,
-     progress: bool = True,
- ) -> str:
-     """Download a file from `url` into `model_dir`,
-     using the file present if possible.
-
-     Returns the path to the downloaded file.
-     """
-     os.makedirs(model_dir, exist_ok=True)
-     if not file_name:
-         parts = urlparse(url)
-         file_name = os.path.basename(parts.path)
-     cached_file = os.path.abspath(os.path.join(model_dir, file_name))
-
-     # Overwrite
-     if os.path.exists(cached_file):
-         if overwrite or os.path.getsize(cached_file) == 0:
-             os.remove(cached_file)
-
-     # Download
-     if not os.path.exists(cached_file):
-         logger.info(f'Downloading: "{url}" to {cached_file}\n')
-         from torch.hub import download_url_to_file
-
-         download_url_to_file(url, cached_file, progress=progress)
-     else:
-         logger.debug(cached_file)
-
-     return cached_file
-
-
- def friendly_name(file: str):
-     if file.startswith("http"):
-         file = urlparse(file).path
-
-     file = os.path.basename(file)
-     model_name, extension = os.path.splitext(file)
-     return model_name, extension
-
-
- def download_manager(
-     url: str,
-     path: str,
-     extension: str = "",
-     overwrite: bool = False,
-     progress: bool = True,
- ):
-     url = url.strip()
-
-     name, ext = friendly_name(url)
-     name += ext if not extension else f".{extension}"
-
-     if url.startswith("http"):
-         filename = load_file_from_url(
-             url=url,
-             model_dir=path,
-             file_name=name,
-             overwrite=overwrite,
-             progress=progress,
-         )
-     else:
-         filename = path
-
-     return filename
-
-
- def load_hu_bert(config, hubert_path=None):
-     from fairseq2 import checkpoint_utils
-
-     if hubert_path is None:
-         hubert_path = ""
-     if not os.path.exists(hubert_path):
-         for id_model in BASE_MODELS:
-             download_manager(
-                 os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR
-             )
-         hubert_path = "hubert_base.pt"
-
-     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-         [hubert_path],
-         suffix="",
-     )
-     hubert_model = models[0]
-     hubert_model = hubert_model.to(config.device)
-     if config.is_half:
-         hubert_model = hubert_model.half()
-     else:
-         hubert_model = hubert_model.float()
-     hubert_model.eval()
-
-     return hubert_model
-
-
- def load_trained_model(model_path, config):
-
-     if not model_path:
-         raise ValueError("No model found")
-
-     logger.info("Loading %s" % model_path)
-     cpt = torch.load(model_path, map_location="cpu")
-     tgt_sr = cpt["config"][-1]
-     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-     if_f0 = cpt.get("f0", 1)
-     if if_f0 == 0:
-         # protect to 0.5 need?
-         pass
-
-     version = cpt.get("version", "v1")
-     if version == "v1":
-         if if_f0 == 1:
-             net_g = SynthesizerTrnMs256NSFsid(
-                 *cpt["config"], is_half=config.is_half
-             )
-         else:
-             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-     elif version == "v2":
-         if if_f0 == 1:
-             net_g = SynthesizerTrnMs768NSFsid(
-                 *cpt["config"], is_half=config.is_half
-             )
-         else:
-             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-     del net_g.enc_q
-
-     net_g.load_state_dict(cpt["weight"], strict=False)
-     net_g.eval().to(config.device)
-
-     if config.is_half:
-         net_g = net_g.half()
-     else:
-         net_g = net_g.float()
-
-     vc = VC(tgt_sr, config)
-     n_spk = cpt["config"][-3]
-
-     return n_spk, tgt_sr, net_g, vc, cpt, version
-
-
- class BaseLoader:
-     def __init__(self, only_cpu=False, hubert_path=None, rmvpe_path=None):
-         self.model_config = {}
-         self.config = None
-         self.cache_model = {}
-         self.only_cpu = only_cpu
-         self.hubert_path = hubert_path
-         self.rmvpe_path = rmvpe_path
-
-     def apply_conf(
-         self,
-         tag="base_model",
-         file_model="",
-         pitch_algo="pm",
-         pitch_lvl=0,
-         file_index="",
-         index_influence=0.66,
-         respiration_median_filtering=3,
-         envelope_ratio=0.25,
-         consonant_breath_protection=0.33,
-         resample_sr=0,
-         file_pitch_algo="",
-     ):
-
-         if not file_model:
-             raise ValueError("Model not found")
-
-         if file_index is None:
-             file_index = ""
-
-         if file_pitch_algo is None:
-             file_pitch_algo = ""
-
-         if not self.config:
-             self.config = Config(self.only_cpu)
-             self.hu_bert_model = None
-             self.model_pitch_estimator = None
-
-         self.model_config[tag] = {
-             "file_model": file_model,
-             "pitch_algo": pitch_algo,
-             "pitch_lvl": pitch_lvl,  # no decimal
-             "file_index": file_index,
-             "index_influence": index_influence,
-             "respiration_median_filtering": respiration_median_filtering,
-             "envelope_ratio": envelope_ratio,
-             "consonant_breath_protection": consonant_breath_protection,
-             "resample_sr": resample_sr,
-             "file_pitch_algo": file_pitch_algo,
-         }
-         return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"
-
-     def infer(
-         self,
-         task_id,
-         params,
-         # load model
-         n_spk,
-         tgt_sr,
-         net_g,
-         pipe,
-         cpt,
-         version,
-         if_f0,
-         # load index
-         index_rate,
-         index,
-         big_npy,
-         # load f0 file
-         inp_f0,
-         # audio file
-         input_audio_path,
-         overwrite,
-         type_output,
-     ):
-
-         f0_method = params["pitch_algo"]
-         f0_up_key = params["pitch_lvl"]
-         filter_radius = params["respiration_median_filtering"]
-         resample_sr = params["resample_sr"]
-         rms_mix_rate = params["envelope_ratio"]
-         protect = params["consonant_breath_protection"]
-         base_sr = 16000
-
-         if isinstance(input_audio_path, tuple):
-             if f0_method == "harvest":
-                 raise ValueError("Harvest not support from array")
-             audio = input_audio_path[0]
-             source_sr = input_audio_path[1]
-             if source_sr != base_sr:
-                 audio = librosa.resample(
-                     audio.astype(np.float32),
-                     orig_sr=source_sr,
-                     target_sr=base_sr
-                 )
-             audio = audio.astype(np.float32).flatten()
-         elif not os.path.exists(input_audio_path):
-             raise ValueError(
-                 "The audio file was not found or is not "
-                 f"a valid file: {input_audio_path}"
-             )
-         else:
-             audio = load_audio(input_audio_path, base_sr)
-
-         f0_up_key = int(f0_up_key)
-
-         # Normalize audio
-         audio_max = np.abs(audio).max() / 0.95
-         if audio_max > 1:
-             audio /= audio_max
-
-         times = [0, 0, 0]
-
-         # filters audio signal, pads it, computes sliding window sums,
-         # and extracts optimized time indices
-         audio = signal.filtfilt(bh, ah, audio)
-         audio_pad = np.pad(
-             audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
-         )
-         opt_ts = []
-         if audio_pad.shape[0] > pipe.t_max:
-             audio_sum = np.zeros_like(audio)
-             for i in range(pipe.window):
-                 audio_sum += audio_pad[i:i - pipe.window]
-             for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
-                 opt_ts.append(
-                     t
-                     - pipe.t_query
-                     + np.where(
-                         np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
-                         == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min()
-                     )[0][0]
-                 )
-
-         s = 0
-         audio_opt = []
-         t = None
-         t1 = ttime()
-
-         sid_value = 0
-         sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()
-
-         # Pads audio symmetrically, calculates length divided by window size.
-         audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
-         p_len = audio_pad.shape[0] // pipe.window
-
-         # Estimates pitch from audio signal
-         pitch, pitchf = None, None
-         if if_f0 == 1:
-             pitch, pitchf = pipe.get_f0(
-                 input_audio_path,
-                 audio_pad,
-                 p_len,
-                 f0_up_key,
-                 f0_method,
-                 filter_radius,
-                 inp_f0,
-             )
-             pitch = pitch[:p_len]
-             pitchf = pitchf[:p_len]
-             if pipe.device == "mps":
-                 pitchf = pitchf.astype(np.float32)
-             pitch = torch.tensor(
-                 pitch, device=pipe.device
-             ).unsqueeze(0).long()
-             pitchf = torch.tensor(
-                 pitchf, device=pipe.device
-             ).unsqueeze(0).float()
-
-         t2 = ttime()
-         times[1] += t2 - t1
-         for t in opt_ts:
-             t = t // pipe.window * pipe.window
-             if if_f0 == 1:
-                 pitch_slice = pitch[
-                     :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
-                 ]
-                 pitchf_slice = pitchf[
-                     :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
-                 ]
-             else:
-                 pitch_slice = None
-                 pitchf_slice = None
-
-             audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
-             audio_opt.append(
-                 pipe.vc(
-                     self.hu_bert_model,
-                     net_g,
-                     sid,
-                     audio_slice,
-                     pitch_slice,
-                     pitchf_slice,
-                     times,
-                     index,
-                     big_npy,
-                     index_rate,
-                     version,
-                     protect,
-                 )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
-             )
-             s = t
-
-         pitch_end_slice = pitch[
-             :, t // pipe.window:
-         ] if t is not None else pitch
-         pitchf_end_slice = pitchf[
-             :, t // pipe.window:
-         ] if t is not None else pitchf
-
-         audio_opt.append(
-             pipe.vc(
-                 self.hu_bert_model,
-                 net_g,
-                 sid,
-                 audio_pad[t:],
-                 pitch_end_slice,
-                 pitchf_end_slice,
-                 times,
-                 index,
-                 big_npy,
-                 index_rate,
-                 version,
-                 protect,
-             )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
-         )
-
-         audio_opt = np.concatenate(audio_opt)
-         if rms_mix_rate != 1:
-             audio_opt = change_rms(
-                 audio, 16000, audio_opt, tgt_sr, rms_mix_rate
-             )
-         if resample_sr >= 16000 and tgt_sr != resample_sr:
-             audio_opt = librosa.resample(
-                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
-             )
-         audio_max = np.abs(audio_opt).max() / 0.99
-         max_int16 = 32768
-         if audio_max > 1:
-             max_int16 /= audio_max
-         audio_opt = (audio_opt * max_int16).astype(np.int16)
-         del pitch, pitchf, sid
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-
-         if tgt_sr != resample_sr >= 16000:
-             final_sr = resample_sr
-         else:
-             final_sr = tgt_sr
-
-         """
-         "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
-             times[0],
-             times[1],
-             times[2],
-         ), (final_sr, audio_opt)
-
-         """
-
-         if type_output == "array":
-             return audio_opt, final_sr
-
-         if overwrite:
-             output_audio_path = input_audio_path  # Overwrite
-         else:
-             basename = os.path.basename(input_audio_path)
-             dirname = os.path.dirname(input_audio_path)
-
-             new_basename = basename.split(
-                 '.')[0] + "_edited." + basename.split('.')[-1]
-             new_path = os.path.join(dirname, new_basename)
-
-             output_audio_path = new_path
-
-         # Save file
-         if type_output:
-             output_audio_path = os.path.splitext(
-                 output_audio_path
-             )[0]+f".{type_output}"
-
-         try:
-             sf.write(
-                 file=output_audio_path,
-                 samplerate=final_sr,
-                 data=audio_opt
-             )
-         except Exception as e:
-             logger.error(e)
-             logger.error("Error saving file, trying with WAV format")
-             output_audio_path = os.path.splitext(output_audio_path)[0]+".wav"
-             sf.write(
-                 file=output_audio_path,
-                 samplerate=final_sr,
-                 data=audio_opt
-             )
-
-         logger.info(str(output_audio_path))
-
-         self.model_config[task_id]["result"].append(output_audio_path)
-         self.output_list.append(output_audio_path)
-
-     def run_threads(self, threads):
-         # Start threads
-         for thread in threads:
-             thread.start()
-
-         # Wait for all threads to finish
-         for thread in threads:
-             thread.join()
-
-         gc.collect()
-         torch.cuda.empty_cache()
-
-     def unload_models(self):
-         self.hu_bert_model = None
-         self.model_pitch_estimator = None
-         self.model_vc = {}
-         self.cache_model = {}
-         gc.collect()
-         torch.cuda.empty_cache()
-
-     def __call__(
-         self,
-         audio_files=[],
-         tag_list=[],
-         overwrite=False,
-         parallel_workers=1,
-         type_output=None,  # ["mp3", "wav", "ogg", "flac"]
-     ):
-         logger.info(f"Parallel workers: {str(parallel_workers)}")
-
-         self.output_list = []
-
-         if not self.model_config:
-             raise ValueError("No model has been configured for inference")
-
-         if isinstance(audio_files, str):
-             audio_files = [audio_files]
-         if isinstance(tag_list, str):
-             tag_list = [tag_list]
-
-         if not audio_files:
-             raise ValueError("No audio found to convert")
-         if not tag_list:
-             tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)
-
-         if len(audio_files) > len(tag_list):
-             logger.info("Extend tag list to match audio files")
-             extend_number = len(audio_files) - len(tag_list)
-             tag_list.extend([tag_list[0]] * extend_number)
-
-         if len(audio_files) < len(tag_list):
-             logger.info("Cut list tags")
-             tag_list = tag_list[:len(audio_files)]
-
-         tag_file_pairs = list(zip(tag_list, audio_files))
-         sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])
-
-         # Base params
-         if not self.hu_bert_model:
-             self.hu_bert_model = load_hu_bert(self.config, self.hubert_path)
-
-         cache_params = None
-         threads = []
-         progress_bar = tqdm(total=len(tag_list), desc="Progress")
-         for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):
-
-             if id_tag not in self.model_config.keys():
-                 logger.info(
-                     f"No configured model for {id_tag} with {input_audio_path}"
-                 )
-                 continue
-
-             if (
-                 len(threads) >= parallel_workers
-                 or cache_params != id_tag
-                 and cache_params is not None
-             ):
-
-                 self.run_threads(threads)
-                 progress_bar.update(len(threads))
-
-                 threads = []
-
-             if cache_params != id_tag:
-
-                 self.model_config[id_tag]["result"] = []
-
-                 # Unload previous
-                 (
-                     n_spk,
-                     tgt_sr,
-                     net_g,
-                     pipe,
-                     cpt,
-                     version,
-                     if_f0,
-                     index_rate,
-                     index,
-                     big_npy,
-                     inp_f0,
-                 ) = [None] * 11
-                 gc.collect()
-                 torch.cuda.empty_cache()
-
-                 # Model params
-                 params = self.model_config[id_tag]
-
-                 model_path = params["file_model"]
-                 f0_method = params["pitch_algo"]
-                 file_index = params["file_index"]
-                 index_rate = params["index_influence"]
-                 f0_file = params["file_pitch_algo"]
-
-                 # Load model
-                 (
-                     n_spk,
-                     tgt_sr,
-                     net_g,
-                     pipe,
-                     cpt,
-                     version
-                 ) = load_trained_model(model_path, self.config)
-                 if_f0 = cpt.get("f0", 1)  # pitch data
-
-                 # Load index
-                 if os.path.exists(file_index) and index_rate != 0:
-                     try:
-                         index = faiss.read_index(file_index)
-                         big_npy = index.reconstruct_n(0, index.ntotal)
-                     except Exception as error:
-                         logger.error(f"Index: {str(error)}")
-                         index_rate = 0
-                         index = big_npy = None
-                 else:
-                     logger.warning("File index not found")
-                     index_rate = 0
-                     index = big_npy = None
-
-                 # Load f0 file
-                 inp_f0 = None
-                 if os.path.exists(f0_file):
-                     try:
-                         with open(f0_file, "r") as f:
-                             lines = f.read().strip("\n").split("\n")
-                         inp_f0 = []
-                         for line in lines:
-                             inp_f0.append([float(i) for i in line.split(",")])
-                         inp_f0 = np.array(inp_f0, dtype="float32")
-                     except Exception as error:
-                         logger.error(f"f0 file: {str(error)}")
-
-                 if "rmvpe" in f0_method:
-                     if not self.model_pitch_estimator:
-                         from infer_rvc_python.lib.rmvpe import RMVPE
-
-                         logger.info("Loading vocal pitch estimator model")
-                         if self.rmvpe_path is None:
-                             self.rmvpe_path = ""
-                         rm_local_path = "rmvpe.pt"
-                         if os.path.exists(self.rmvpe_path):
-                             rm_local_path = self.rmvpe_path
-                         self.model_pitch_estimator = RMVPE(
-                             rm_local_path,
-                             is_half=self.config.is_half,
-                             device=self.config.device
-                         )
-
-                     pipe.model_rmvpe = self.model_pitch_estimator
-
-                 cache_params = id_tag
-
-             # self.infer(
-             #     id_tag,
-             #     params,
-             #     # load model
-             #     n_spk,
-             #     tgt_sr,
-             #     net_g,
-             #     pipe,
-             #     cpt,
-             #     version,
-             #     if_f0,
-             #     # load index
-             #     index_rate,
-             #     index,
-             #     big_npy,
-             #     # load f0 file
-             #     inp_f0,
-             #     # output file
-             #     input_audio_path,
-             #     overwrite,
-             #     type_output,
-             # )
-
-             thread = threading.Thread(
-                 target=self.infer,
-                 args=(
-                     id_tag,
-                     params,
-                     # loaded model
-                     n_spk,
-                     tgt_sr,
-                     net_g,
-                     pipe,
-                     cpt,
-                     version,
-                     if_f0,
-                     # loaded index
-                     index_rate,
-                     index,
-                     big_npy,
-                     # loaded f0 file
-                     inp_f0,
-                     # audio file
-                     input_audio_path,
-                     overwrite,
-                     type_output,
-                 )
-             )
-
-             threads.append(thread)
-
-         # Run last
-         if threads:
-             self.run_threads(threads)
-
-         progress_bar.update(len(threads))
-         progress_bar.close()
-
-         final_result = []
-         valid_tags = set(tag_list)
-         for tag in valid_tags:
-             if (
-                 tag in self.model_config.keys()
-                 and "result" in self.model_config[tag].keys()
-             ):
-                 final_result.extend(self.model_config[tag]["result"])
-
-         return final_result
-
-     def generate_from_cache(
-         self,
-         audio_data=None,  # str or tuple (<array data>,<int sampling rate>)
-         tag=None,
-         reload=False,
-     ):
-
-         if not self.model_config:
-             raise ValueError("No model has been configured for inference")
-
-         if not audio_data:
-             raise ValueError(
-                 "An audio file or tuple with "
-                 "(<numpy data audio>,<sampling rate>) is needed"
-             )
-
-         # Base params
-         if not self.hu_bert_model:
-             self.hu_bert_model = load_hu_bert(self.config, self.hubert_path)
-
-         if tag not in self.model_config.keys():
-             raise ValueError(
-                 f"No configured model for {tag}"
-             )
-
-         now_data = self.model_config[tag]
-         now_data["tag"] = tag
-
-         if self.cache_model != now_data and not reload:
-
-             # Unload previous
-             self.model_vc = {}
-             gc.collect()
-             torch.cuda.empty_cache()
-
-             model_path = now_data["file_model"]
-             f0_method = now_data["pitch_algo"]
-             file_index = now_data["file_index"]
-             index_rate = now_data["index_influence"]
-             f0_file = now_data["file_pitch_algo"]
-
-             # Load model
-             (
-                 self.model_vc["n_spk"],
-                 self.model_vc["tgt_sr"],
-                 self.model_vc["net_g"],
-                 self.model_vc["pipe"],
-                 self.model_vc["cpt"],
-                 self.model_vc["version"]
-             ) = load_trained_model(model_path, self.config)
-             self.model_vc["if_f0"] = self.model_vc["cpt"].get("f0", 1)
-
-             # Load index
-             if os.path.exists(file_index) and index_rate != 0:
-                 try:
-                     index = faiss.read_index(file_index)
-                     big_npy = index.reconstruct_n(0, index.ntotal)
-                 except Exception as error:
-                     logger.error(f"Index: {str(error)}")
-                     index_rate = 0
-                     index = big_npy = None
-             else:
-                 logger.warning("File index not found")
-                 index_rate = 0
-                 index = big_npy = None
-
-             self.model_vc["index_rate"] = index_rate
-             self.model_vc["index"] = index
-             self.model_vc["big_npy"] = big_npy
-
-             # Load f0 file
-             inp_f0 = None
-             if os.path.exists(f0_file):
-                 try:
-                     with open(f0_file, "r") as f:
-                         lines = f.read().strip("\n").split("\n")
-                     inp_f0 = []
-                     for line in lines:
-                         inp_f0.append([float(i) for i in line.split(",")])
-                     inp_f0 = np.array(inp_f0, dtype="float32")
-                 except Exception as error:
-                     logger.error(f"f0 file: {str(error)}")
-
-             self.model_vc["inp_f0"] = inp_f0
-
-             if "rmvpe" in f0_method:
-                 if not self.model_pitch_estimator:
-                     from infer_rvc_python.lib.rmvpe import RMVPE
-
-                     logger.info("Loading vocal pitch estimator model")
-                     if self.rmvpe_path is None:
-                         self.rmvpe_path = ""
-                     rm_local_path = "rmvpe.pt"
-                     if os.path.exists(self.rmvpe_path):
-                         rm_local_path = self.rmvpe_path
-                     self.model_pitch_estimator = RMVPE(
-                         rm_local_path,
-                         is_half=self.config.is_half,
-                         device=self.config.device
-                     )
-
-                 self.model_vc["pipe"].model_rmvpe = self.model_pitch_estimator
-
-             self.cache_model = copy.deepcopy(now_data)
-
-         return self.infer(
-             tag,
-             now_data,
-             # load model
-             self.model_vc["n_spk"],
-             self.model_vc["tgt_sr"],
-             self.model_vc["net_g"],
-             self.model_vc["pipe"],
-             self.model_vc["cpt"],
-             self.model_vc["version"],
-             self.model_vc["if_f0"],
-             # load index
-             self.model_vc["index_rate"],
-             self.model_vc["index"],
-             self.model_vc["big_npy"],
-             # load f0 file
-             self.model_vc["inp_f0"],
-             # output file
-             audio_data,
-             False,
-             "array",
-         )
+ from rvcinfpy.lib.log_config import logger
+ import torch
+ import gc
+ import numpy as np
+ import os
+ import warnings
+ import threading
+ from tqdm import tqdm
+ from rvcinfpy.lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from rvcinfpy.lib.audio import load_audio
+ import soundfile as sf
+ from scipy import signal
+ from time import time as ttime
+ import faiss
+ from rvcinfpy.root_pipe import VC, change_rms, bh, ah
+ import librosa
+ from urllib.parse import urlparse
+ import copy
+ from rvcinfpy.utils import download_manager, load_file_from_url
+ warnings.filterwarnings("ignore")
+
+
+ class Config:
+     def __init__(self, only_cpu=False):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.gpu_mem = None
+         (
+             self.x_pad,
+             self.x_query,
+             self.x_center,
+             self.x_max
+         ) = self.device_config(only_cpu)
+
+     def device_config(self, only_cpu) -> tuple:
+         if torch.cuda.is_available() and not only_cpu:
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if (
+                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                 or "P40" in self.gpu_name.upper()
+                 or "1060" in self.gpu_name
+                 or "1070" in self.gpu_name
+                 or "1080" in self.gpu_name
+             ):
+                 logger.info(
+                     "16/10 Series GPUs and P40 excel "
+                     "in single-precision tasks."
+                 )
+                 self.is_half = False
+             else:
+                 self.gpu_name = None
+             self.gpu_mem = int(
+                 torch.cuda.get_device_properties(i_device).total_memory
+                 / 1024
+                 / 1024
+                 / 1024
+                 + 0.4
+             )
+         elif torch.backends.mps.is_available() and not only_cpu:
+             logger.info("Supported N-card not found, using MPS for inference")
+             self.device = "mps"
+         else:
+             logger.info("No supported N-card found, using CPU for inference")
+             self.device = "cpu"
+             self.is_half = False
+
+         if self.n_cpu == 0:
+             self.n_cpu = os.cpu_count()
+
+         if self.is_half:
+             # 6GB VRAM configuration
+             x_pad = 3
+             x_query = 10
+             x_center = 60
+             x_max = 65
+         else:
+             # 5GB VRAM configuration
+             x_pad = 1
+             x_query = 6
+             x_center = 38
+             x_max = 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad = 1
+             x_query = 5
+             x_center = 30
+             x_max = 32
+
+         logger.info(
+             f"Config: Device is {self.device}, "
+             f"half precision is {self.is_half}"
+         )
+
+         return x_pad, x_query, x_center, x_max
+
+
+ BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
+ BASE_MODELS = [
+     "hubert_base.pt",
+     "rmvpe.pt"
+ ]
+ BASE_DIR = "."
+
+
+
+
+
+ def load_hu_bert(config, hubert_path=None):
+     from fairseq import checkpoint_utils
+
+     if hubert_path is None:
+         hubert_path = ""
+     if not os.path.exists(hubert_path):
+         for id_model in BASE_MODELS:
+             download_manager(
+                 os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR
+             )
+         hubert_path = "hubert_base.pt"
+
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         [hubert_path],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+     return hubert_model
+
+
+ def load_trained_model(model_path, config):
+
+     if not model_path:
+         raise ValueError("No model found")
+
+     logger.info("Loading %s" % model_path)
+     cpt = torch.load(model_path, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         # protect to 0.5 need?
+         pass
+
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(
+                 *cpt["config"], is_half=config.is_half
+             )
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(
+                 *cpt["config"], is_half=config.is_half
+             )
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     net_g.eval().to(config.device)
+
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+
+     return n_spk, tgt_sr, net_g, vc, cpt, version
+
+
+ class BaseLoader:
+     def __init__(self, only_cpu=False, hubert_path=None, rmvpe_path=None):
+         self.model_config = {}
+         self.config = None
+         self.cache_model = {}
+         self.only_cpu = only_cpu
+         self.hubert_path = hubert_path
+         self.rmvpe_path = rmvpe_path
+
+     def apply_conf(
+         self,
+         tag="base_model",
+         file_model="",
+         pitch_algo="pm",
+         pitch_lvl=0,
+         file_index="",
+         index_influence=0.66,
+         respiration_median_filtering=3,
+         envelope_ratio=0.25,
+         consonant_breath_protection=0.33,
+         resample_sr=0,
+         file_pitch_algo="",
+     ):
+
+         if not file_model:
+             raise ValueError("Model not found")
+
+         if file_index is None:
+             file_index = ""
+
+         if file_pitch_algo is None:
+             file_pitch_algo = ""
+
+         if not self.config:
+             self.config = Config(self.only_cpu)
+             self.hu_bert_model = None
+             self.model_pitch_estimator = None
+
+         self.model_config[tag] = {
+             "file_model": file_model,
+             "pitch_algo": pitch_algo,
+             "pitch_lvl": pitch_lvl,  # no decimal
+             "file_index": file_index,
+             "index_influence": index_influence,
+             "respiration_median_filtering": respiration_median_filtering,
+             "envelope_ratio": envelope_ratio,
+             "consonant_breath_protection": consonant_breath_protection,
+             "resample_sr": resample_sr,
+             "file_pitch_algo": file_pitch_algo,
+         }
+         return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"
+
+     def infer(
+         self,
+         task_id,
+         params,
+         # load model
+         n_spk,
+         tgt_sr,
+         net_g,
+         pipe,
+         cpt,
+         version,
+         if_f0,
+         # load index
+         index_rate,
+         index,
+         big_npy,
+         # load f0 file
+         inp_f0,
+         # audio file
+         input_audio_path,
+         overwrite,
+         type_output,
+     ):
+
+         f0_method = params["pitch_algo"]
+         f0_up_key = params["pitch_lvl"]
+         filter_radius = params["respiration_median_filtering"]
+         resample_sr = params["resample_sr"]
+         rms_mix_rate = params["envelope_ratio"]
+         protect = params["consonant_breath_protection"]
+         base_sr = 16000
+
+         if isinstance(input_audio_path, tuple):
+             if f0_method == "harvest":
+                 raise ValueError("Harvest not support from array")
+             audio = input_audio_path[0]
+             source_sr = input_audio_path[1]
+             if source_sr != base_sr:
+                 audio = librosa.resample(
+                     audio.astype(np.float32),
+                     orig_sr=source_sr,
+                     target_sr=base_sr
+                 )
+             audio = audio.astype(np.float32).flatten()
+         elif not os.path.exists(input_audio_path):
+             raise ValueError(
+                 "The audio file was not found or is not "
+                 f"a valid file: {input_audio_path}"
+             )
+         else:
+             audio = load_audio(input_audio_path, base_sr)
+
+         f0_up_key = int(f0_up_key)
+
+         # Normalize audio
+         audio_max = np.abs(audio).max() / 0.95
+         if audio_max > 1:
+             audio /= audio_max
+
+         times = [0, 0, 0]
+
+         # filters audio signal, pads it, computes sliding window sums,
+         # and extracts optimized time indices
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(
+             audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
+         )
+         opt_ts = []
+         if audio_pad.shape[0] > pipe.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(pipe.window):
+                 audio_sum += audio_pad[i:i - pipe.window]
+             for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
+                 opt_ts.append(
+                     t
+                     - pipe.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
+                         == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min()
+                     )[0][0]
+                 )
+
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+
+         sid_value = 0
+         sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()
+
+         # Pads audio symmetrically, calculates length divided by window size.
+         audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // pipe.window
+
+         # Estimates pitch from audio signal
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = pipe.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if pipe.device == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(
+                 pitch, device=pipe.device
+             ).unsqueeze(0).long()
+             pitchf = torch.tensor(
+                 pitchf, device=pipe.device
+             ).unsqueeze(0).float()
+
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // pipe.window * pipe.window
+             if if_f0 == 1:
+                 pitch_slice = pitch[
+                     :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
+                 ]
+                 pitchf_slice = pitchf[
+                     :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
+                 ]
+             else:
+                 pitch_slice = None
+                 pitchf_slice = None
+
+             audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
+             audio_opt.append(
+                 pipe.vc(
+                     self.hu_bert_model,
+                     net_g,
+                     sid,
+                     audio_slice,
+                     pitch_slice,
+                     pitchf_slice,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
+             )
+             s = t
+
+         pitch_end_slice = pitch[
+             :, t // pipe.window:
+         ] if t is not None else pitch
+         pitchf_end_slice = pitchf[
+             :, t // pipe.window:
+         ] if t is not None else pitchf
+
+         audio_opt.append(
+             pipe.vc(
+                 self.hu_bert_model,
+                 net_g,
+                 sid,
+                 audio_pad[t:],
+                 pitch_end_slice,
+                 pitchf_end_slice,
+                 times,
+                 index,
+                 big_npy,
+                 index_rate,
+                 version,
+                 protect,
+             )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
+         )
+
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(
+                 audio, 16000, audio_opt, tgt_sr, rms_mix_rate
+             )
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         if tgt_sr != resample_sr >= 16000:
+             final_sr = resample_sr
+         else:
+             final_sr = tgt_sr
+
+         """
+         "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+             times[0],
+             times[1],
+             times[2],
+         ), (final_sr, audio_opt)
+
+         """
+
+         if type_output == "array":
+             return audio_opt, final_sr
+
+         if overwrite:
+             output_audio_path = input_audio_path  # Overwrite
+         else:
+             basename = os.path.basename(input_audio_path)
+             dirname = os.path.dirname(input_audio_path)
+
+             new_basename = basename.split(
+                 '.')[0] + "_edited." + basename.split('.')[-1]
+             new_path = os.path.join(dirname, new_basename)
+
+             output_audio_path = new_path
+
+         # Save file
+         if type_output:
+             output_audio_path = os.path.splitext(
+                 output_audio_path
+             )[0]+f".{type_output}"
+
+         try:
+             sf.write(
+                 file=output_audio_path,
+                 samplerate=final_sr,
+                 data=audio_opt
+             )
+         except Exception as e:
+             logger.error(e)
+             logger.error("Error saving file, trying with WAV format")
+             output_audio_path = os.path.splitext(output_audio_path)[0]+".wav"
+             sf.write(
+                 file=output_audio_path,
+                 samplerate=final_sr,
+                 data=audio_opt
+             )
+
+         logger.info(str(output_audio_path))
+
+         self.model_config[task_id]["result"].append(output_audio_path)
+         self.output_list.append(output_audio_path)
+
+     def run_threads(self, threads):
+         # Start threads
+         for thread in threads:
+             thread.start()
+
+         # Wait for all threads to finish
+         for thread in threads:
+             thread.join()
+
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     def unload_models(self):
+         self.hu_bert_model = None
+         self.model_pitch_estimator = None
+         self.model_vc = {}
+         self.cache_model = {}
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     def __call__(
+         self,
+         audio_files=[],
+         tag_list=[],
+         overwrite=False,
+         parallel_workers=1,
+         type_output=None,  # ["mp3", "wav", "ogg", "flac"]
+     ):
+         logger.info(f"Parallel workers: {str(parallel_workers)}")
+
+         self.output_list = []
+
+         if not self.model_config:
+             raise ValueError("No model has been configured for inference")
+
+         if isinstance(audio_files, str):
+             audio_files = [audio_files]
+         if isinstance(tag_list, str):
+             tag_list = [tag_list]
+
+         if not audio_files:
+             raise ValueError("No audio found to convert")
+         if not tag_list:
+             tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)
+
+         if len(audio_files) > len(tag_list):
+             logger.info("Extend tag list to match audio files")
+             extend_number = len(audio_files) - len(tag_list)
+             tag_list.extend([tag_list[0]] * extend_number)
+
+         if len(audio_files) < len(tag_list):
+             logger.info("Cut list tags")
+             tag_list = tag_list[:len(audio_files)]
+
+         tag_file_pairs = list(zip(tag_list, audio_files))
+         sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])
+
+         # Base params
+         if not self.hu_bert_model:
+             self.hu_bert_model = load_hu_bert(self.config, self.hubert_path)
+
+         cache_params = None
+         threads = []
+         progress_bar = tqdm(total=len(tag_list), desc="Progress")
+         for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):
+
+             if id_tag not in self.model_config.keys():
+                 logger.info(
+                     f"No configured model for {id_tag} with {input_audio_path}"
+                 )
+                 continue
+
+             if (
+                 len(threads) >= parallel_workers
+                 or cache_params != id_tag
+                 and cache_params is not None
+             ):
+
+                 self.run_threads(threads)
+                 progress_bar.update(len(threads))
+
+                 threads = []
+
+             if cache_params != id_tag:
+
+                 self.model_config[id_tag]["result"] = []
+
+                 # Unload previous
+                 (
+                     n_spk,
+                     tgt_sr,
+                     net_g,
+                     pipe,
+                     cpt,
+                     version,
+                     if_f0,
+                     index_rate,
+                     index,
+                     big_npy,
+                     inp_f0,
+                 ) = [None] * 11
+                 gc.collect()
+                 torch.cuda.empty_cache()
+
+                 # Model params
+                 params = self.model_config[id_tag]
+
+                 model_path = params["file_model"]
+                 f0_method = params["pitch_algo"]
+                 file_index = params["file_index"]
+                 index_rate = params["index_influence"]
+                 f0_file = params["file_pitch_algo"]
+
+                 # Load model
+                 (
+                     n_spk,
+                     tgt_sr,
+                     net_g,
+                     pipe,
+                     cpt,
+                     version
+                 ) = load_trained_model(model_path, self.config)
+                 if_f0 = cpt.get("f0", 1)  # pitch data
+
+                 # Load index
+                 if os.path.exists(file_index) and index_rate != 0:
+                     try:
+                         index = faiss.read_index(file_index)
+                         big_npy = index.reconstruct_n(0, index.ntotal)
+                     except Exception as error:
+                         logger.error(f"Index: {str(error)}")
+                         index_rate = 0
+                         index = big_npy = None
+                 else:
+                     logger.warning("File index not found")
+                     index_rate = 0
+                     index = big_npy = None
+
+                 # Load f0 file
+                 inp_f0 = None
+                 if os.path.exists(f0_file):
+                     try:
+                         with open(f0_file, "r") as f:
+                             lines = f.read().strip("\n").split("\n")
+                         inp_f0 = []
+                         for line in lines:
+                             inp_f0.append([float(i) for i in line.split(",")])
+                         inp_f0 = np.array(inp_f0, dtype="float32")
+                     except Exception as error:
+                         logger.error(f"f0 file: {str(error)}")
+
+                 if "rmvpe" in f0_method:
+                     if not self.model_pitch_estimator:
+                         from rvcinfpy.lib.rmvpe import RMVPE
+
+                         logger.info("Loading vocal pitch estimator model")
+                         if self.rmvpe_path is None:
+                             self.rmvpe_path = ""
+                         rm_local_path = "rmvpe.pt"
+                         if os.path.exists(self.rmvpe_path):
+                             rm_local_path = self.rmvpe_path
+                         self.model_pitch_estimator = RMVPE(
+                             rm_local_path,
+                             is_half=self.config.is_half,
+                             device=self.config.device
+                         )
+
+                     pipe.model_rmvpe = self.model_pitch_estimator
+
+                 cache_params = id_tag
+
+             # self.infer(
+             #     id_tag,
+             #     params,
+             #     # load model
+             #     n_spk,
+             #     tgt_sr,
+             #     net_g,
+             #     pipe,
+             #     cpt,
+             #     version,
+             #     if_f0,
+             #     # load index
+             #     index_rate,
+             #     index,
+             #     big_npy,
+             #     # load f0 file
+             #     inp_f0,
+             #     # output file
+             #     input_audio_path,
+             #     overwrite,
+             #     type_output,
+             # )
+
+             thread = threading.Thread(
+                 target=self.infer,
+                 args=(
+                     id_tag,
+                     params,
+                     # loaded model
+                     n_spk,
+                     tgt_sr,
+                     net_g,
+                     pipe,
+                     cpt,
+                     version,
+                     if_f0,
+                     # loaded index
+                     index_rate,
+                     index,
+                     big_npy,
+                     # loaded f0 file
+                     inp_f0,
+                     # audio file
+                     input_audio_path,
+                     overwrite,
+                     type_output,
+                 )
+             )
+
+             threads.append(thread)
+
+         # Run last
+         if threads:
+             self.run_threads(threads)
+
+         progress_bar.update(len(threads))
+         progress_bar.close()
+
+         final_result = []
+         valid_tags = set(tag_list)
+         for tag in valid_tags:
+             if (
+                 tag in self.model_config.keys()
+                 and "result" in self.model_config[tag].keys()
+             ):
+                 final_result.extend(self.model_config[tag]["result"])
+
+         return final_result
+
+     def generate_from_cache(
+         self,
+         audio_data=None,  # str or tuple (<array data>,<int sampling rate>)
+         tag=None,
+         reload=False,
+     ):
+
+         if not self.model_config:
+             raise ValueError("No model has been configured for inference")
+
+         if not audio_data:
+             raise ValueError(
+                 "An audio file or tuple with "
+                 "(<numpy data audio>,<sampling rate>) is needed"
+             )
+
+         # Base params
+         if not self.hu_bert_model:
+             self.hu_bert_model = load_hu_bert(self.config, self.hubert_path)
+
+         if tag not in self.model_config.keys():
+             raise ValueError(
+                 f"No configured model for {tag}"
+             )
+
+         now_data = self.model_config[tag]
+         now_data["tag"] = tag
+
+         if self.cache_model != now_data and not reload:
+
+             # Unload previous
+             self.model_vc = {}
+             gc.collect()
+             torch.cuda.empty_cache()
+
+             model_path = now_data["file_model"]
+             f0_method = now_data["pitch_algo"]
+             file_index = now_data["file_index"]
+             index_rate = now_data["index_influence"]
+             f0_file = now_data["file_pitch_algo"]
+
+             # Load model
+             (
+                 self.model_vc["n_spk"],
+                 self.model_vc["tgt_sr"],
+                 self.model_vc["net_g"],
+                 self.model_vc["pipe"],
+                 self.model_vc["cpt"],
+                 self.model_vc["version"]
+             ) = load_trained_model(model_path, self.config)
+             self.model_vc["if_f0"] = self.model_vc["cpt"].get("f0", 1)
+
+             # Load index
+             if os.path.exists(file_index) and index_rate != 0:
+                 try:
+                     index = faiss.read_index(file_index)
+                     big_npy = index.reconstruct_n(0, index.ntotal)
+                 except Exception as error:
+                     logger.error(f"Index: {str(error)}")
+                     index_rate = 0
+                     index = big_npy = None
+             else:
+                 logger.warning("File index not found")
+                 index_rate = 0
+                 index = big_npy = None
+
+             self.model_vc["index_rate"] = index_rate
+             self.model_vc["index"] = index
+             self.model_vc["big_npy"] = big_npy
+
+             # Load f0 file
+             inp_f0 = None
+             if os.path.exists(f0_file):
+                 try:
+                     with open(f0_file, "r") as f:
+                         lines = f.read().strip("\n").split("\n")
+                     inp_f0 = []
+                     for line in lines:
+                         inp_f0.append([float(i) for i in line.split(",")])
+                     inp_f0 = np.array(inp_f0, dtype="float32")
+                 except Exception as error:
+                     logger.error(f"f0 file: {str(error)}")
+
+             self.model_vc["inp_f0"] = inp_f0
+
+             if "rmvpe" in f0_method:
+                 if not self.model_pitch_estimator:
+                     from rvcinfpy.lib.rmvpe import RMVPE
+
+                     logger.info("Loading vocal pitch estimator model")
+                     if self.rmvpe_path is None:
+                         self.rmvpe_path = ""
+                     rm_local_path = "rmvpe.pt"
+                     if os.path.exists(self.rmvpe_path):
+                         rm_local_path = self.rmvpe_path
+                     self.model_pitch_estimator = RMVPE(
+                         rm_local_path,
+                         is_half=self.config.is_half,
+                         device=self.config.device
+                     )
+
+                 self.model_vc["pipe"].model_rmvpe = self.model_pitch_estimator
+
+             self.cache_model = copy.deepcopy(now_data)
+
+         return self.infer(
+             tag,
+             now_data,
+             # load model
+             self.model_vc["n_spk"],
+             self.model_vc["tgt_sr"],
+             self.model_vc["net_g"],
+             self.model_vc["pipe"],
+             self.model_vc["cpt"],
+             self.model_vc["version"],
+             self.model_vc["if_f0"],
+             # load index
+             self.model_vc["index_rate"],
+             self.model_vc["index"],
+             self.model_vc["big_npy"],
+             # load f0 file
+             self.model_vc["inp_f0"],
+             # output file
+             audio_data,
+             False,
+             "array",
+         )
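
For reference, a minimal usage sketch of the updated module. It assumes the package is importable as rvcinfpy with this file at rvcinfpy.main, and the model.pth / model.index file names are placeholders for your own checkpoint and faiss index:

    from rvcinfpy.main import BaseLoader

    # Register a model under a tag; hubert_base.pt and rmvpe.pt are
    # downloaded automatically on first use if not found locally.
    converter = BaseLoader(only_cpu=False)
    converter.apply_conf(
        tag="my_voice",
        file_model="model.pth",      # placeholder: RVC .pth checkpoint
        pitch_algo="rmvpe",
        pitch_lvl=0,
        file_index="model.index",    # placeholder: faiss index (optional)
        index_influence=0.66,
    )

    # Batch mode: converts files in worker threads and returns output paths
    # (each result is written next to its input as <name>_edited.<ext>).
    paths = converter(
        audio_files=["input.wav"],
        tag_list=["my_voice"],
        overwrite=False,
        parallel_workers=1,
        type_output="wav",
    )

    # Cached mode: keeps the loaded model between calls and returns
    # (int16 numpy array, sample rate) instead of writing a file.
    audio_array, sample_rate = converter.generate_from_cache(
        audio_data="input.wav",
        tag="my_voice",
    )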