Katock committed
Commit c7182d9 · 1 Parent(s): 3d4653f
Files changed (2):
  1. app.py +23 -10
  2. inference/infer_tool.py +124 -107
app.py CHANGED
@@ -9,6 +9,7 @@ import librosa
 import numpy as np
 import soundfile
 
+from inference import infer_tool
 from inference.infer_tool import Svc
 
 logging.getLogger('numba').setLevel(logging.WARNING)
@@ -31,7 +32,7 @@ def audio_postprocess(self, y):
 gr.Audio.postprocess = audio_postprocess
 
 
-def create_vc_fn(model, sid):
+def create_vc_fn(model, spk):
     def vc_fn(input_audio, vc_transform, auto_f0):
         if input_audio is None:
             return "Please upload audio first", None
@@ -39,17 +40,29 @@ def create_vc_fn(model, sid):
         duration = audio.shape[0] / sampling_rate
         if duration > 20 and limitation:
             return "Please upload audio under 20 seconds, or crop it via the top-right corner", None
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        if sampling_rate != 16000:
-            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+
+        # audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        # if len(audio.shape) > 1:
+        #     audio = librosa.to_mono(audio.transpose(1, 0))
+        # if sampling_rate != 16000:
+        #     audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
         raw_path = io.BytesIO()
-        soundfile.write(raw_path, audio, 16000, format="wav")
+        soundfile.write(raw_path, audio, sampling_rate, format="wav")
         raw_path.seek(0)
-        out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
-                                        auto_predict_f0=auto_f0,
-                                        )
+        if "." not in raw_path:
+            raw_path += ".wav"
+        print("path: ", raw_path)
+        infer_tool.format_wav(raw_path)
+        # out_audio, out_sr, _ = model.infer(spk, vc_transform, raw_path,
+        #                                    auto_predict_f0=auto_f0,
+        #                                    )
+        out_audio = model.slice_inference(raw_audio_path=raw_path,
+                                          spk=spk,
+                                          tran=vc_transform,
+                                          slice_db=-40,
+                                          cluster_infer_ratio=0,
+                                          auto_predict_f0=auto_f0,
+                                          noice_scale=0.4)
         return "Success", (44100, out_audio.cpu().numpy())
 
     return vc_fn
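Note on the new upload handling: slice_inference() and infer_tool.format_wav() take a filesystem path (slice_inference opens Path(raw_audio_path).with_suffix('.wav')), but the hunk above still writes into an io.BytesIO and then treats it like a string; '"." not in raw_path' raises TypeError on a BytesIO. A minimal sketch of the flow this appears to be aiming for, assuming a temporary file is acceptable (the tempfile handling is not part of this commit):

import tempfile

import soundfile

def upload_to_wav_path(audio, sampling_rate):
    # write the uploaded buffer to a real .wav so downstream code can reopen it by path
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    soundfile.write(tmp.name, audio, sampling_rate, format="wav")
    tmp.close()
    return tmp.name

# raw_path = upload_to_wav_path(audio, sampling_rate)
# infer_tool.format_wav(raw_path)
# out_audio = model.slice_inference(raw_audio_path=raw_path, spk=spk, tran=vc_transform,
#                                   slice_db=-40, cluster_infer_ratio=0,
#                                   auto_predict_f0=auto_f0, noice_scale=0.4)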
inference/infer_tool.py CHANGED
@@ -85,16 +85,19 @@ def get_end_file(dir_path, end):
 def get_md5(content):
     return hashlib.new("md5", content).hexdigest()
 
+
 def fill_a_to_b(a, b):
     if len(a) < len(b):
         for _ in range(0, len(b) - len(a)):
             a.append(a[0])
 
+
 def mkdir(paths: list):
     for path in paths:
         if not os.path.exists(path):
             os.mkdir(path)
 
+
 def pad_array(arr, target_length):
     current_length = arr.shape[0]
     if current_length >= target_length:
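For reference, pad_array (its body continues in the next hunk) centre-pads a clip back to a target length, splitting the padding evenly between the two sides; slice_inference relies on it to restore each converted chunk to its expected length. A runnable sketch, with the early-return branch filled in from the unchanged lines (an assumption based on the surrounding code):

import numpy as np

def pad_array(arr, target_length):
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    return np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))

print(pad_array(np.array([1, 2, 3]), 6))  # [0 1 2 3 0 0]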
@@ -105,26 +108,28 @@ def pad_array(arr, target_length):
     pad_right = pad_width - pad_left
     padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
     return padded_arr
-
+
+
 def split_list_by_n(list_collection, n, pre=0):
     for i in range(0, len(list_collection), n):
-        yield list_collection[i-pre if i-pre>=0 else i: i + n]
+        yield list_collection[i - pre if i - pre >= 0 else i: i + n]
 
 
 class F0FilterException(Exception):
     pass
 
+
 class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
                  cluster_model_path="logs/44k/kmeans_10000.pt",
-                 nsf_hifigan_enhance = False,
+                 nsf_hifigan_enhance=False,
                  diffusion_model_path="logs/44k/diffusion/model_0.pt",
                  diffusion_config_path="configs/diffusion.yaml",
-                 shallow_diffusion = False,
-                 only_diffusion = False,
-                 spk_mix_enable = False,
-                 feature_retrieval = False
+                 shallow_diffusion=False,
+                 only_diffusion=False,
+                 spk_mix_enable=False,
+                 feature_retrieval=False
                  ):
         self.net_g_path = net_g_path
         self.only_diffusion = only_diffusion
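The reformatted yield above is the chunking helper slice_inference uses: with pre > 0, every chunk after the first reaches back pre items, so neighbouring chunks share an overlap region that is later crossfaded. Illustrative check (values arbitrary):

def split_list_by_n(list_collection, n, pre=0):
    for i in range(0, len(list_collection), n):
        yield list_collection[i - pre if i - pre >= 0 else i: i + n]

chunks = list(split_list_by_n(list(range(10)), n=4, pre=2))
# [[0, 1, 2, 3], [2, 3, 4, 5, 6, 7], [6, 7, 8, 9]]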
@@ -152,7 +157,9 @@ class Svc(object):
         self.nsf_hifigan_enhance = nsf_hifigan_enhance
         if self.shallow_diffusion or self.only_diffusion:
             if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
-                self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
+                self.diffusion_model, self.vocoder, self.diffusion_args = load_model_vocoder(diffusion_model_path,
+                                                                                             self.dev,
+                                                                                             config_path=diffusion_config_path)
             if self.only_diffusion:
                 self.target_sample = self.diffusion_args.data.sampling_rate
                 self.hop_size = self.diffusion_args.data.block_size
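Unrelated to the line-wrapping change, the guard above tests os.path.exists(diffusion_model_path) twice; the second test was presumably meant for diffusion_config_path. A sketch of the likely intended check (an assumption, not what this commit ships):

import os

def diffusion_assets_present(model_path, config_path):
    # check both files instead of checking the model path twice
    return os.path.exists(model_path) and os.path.exists(config_path)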
@@ -163,32 +170,32 @@ class Svc(object):
             else:
                 print("No diffusion model or config found. Shallow diffusion mode will False")
                 self.shallow_diffusion = self.only_diffusion = False
-
+
         # load hubert and model
         if not self.only_diffusion:
             self.load_model(spk_mix_enable)
-            self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
+            self.hubert_model = utils.get_speech_encoder(self.speech_encoder, device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.hop_size)
         else:
-            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
+            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder, device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
-
+
         if os.path.exists(cluster_model_path):
             if self.feature_retrieval:
-                with open(cluster_model_path,"rb") as f:
+                with open(cluster_model_path, "rb") as f:
                     self.cluster_model = pickle.load(f)
                 self.big_npy = None
                 self.now_spk_id = -1
             else:
                 self.cluster_model = cluster.get_cluster_model(cluster_model_path)
         else:
-            self.feature_retrieval=False
+            self.feature_retrieval = False
 
-        if self.shallow_diffusion : self.nsf_hifigan_enhance = False
+        if self.shallow_diffusion: self.nsf_hifigan_enhance = False
         if self.nsf_hifigan_enhance:
             from modules.enhancer import Enhancer
-            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
-
+            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model', device=self.dev)
+
     def load_model(self, spk_mix_enable=False):
         # get model configuration
         self.net_g_ms = SynthesizerTrn(
@@ -203,10 +210,12 @@ class Svc(object):
         if spk_mix_enable:
             self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
 
-    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
+    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
+
+        f0_predictor_object = utils.get_f0_predictor(f0_predictor, hop_length=self.hop_size,
+                                                     sampling_rate=self.target_sample, device=self.dev,
+                                                     threshold=cr_threshold)
 
-        f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
-
         f0, uv = f0_predictor_object.compute_f0_uv(wav)
         if f0_filter and sum(f0) == 0:
             raise F0FilterException("No voice detected")
@@ -222,7 +231,7 @@ class Svc(object):
         c = self.hubert_model.encoder(wav16k)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
 
-        if cluster_infer_ratio !=0:
+        if cluster_infer_ratio != 0:
             if self.feature_retrieval:
                 speaker_id = self.spk2id.get(speaker)
                 if speaker_id is None:
@@ -231,17 +240,17 @@ class Svc(object):
                     if len(self.spk2id.__dict__) >= speaker:
                         speaker_id = speaker
                 feature_index = self.cluster_model[speaker_id]
-                feat_np = c.transpose(0,1).cpu().numpy()
+                feat_np = c.transpose(0, 1).cpu().numpy()
                 if self.big_npy is None or self.now_spk_id != speaker_id:
-                    self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
-                    self.now_spk_id = speaker_id
+                    self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
+                    self.now_spk_id = speaker_id
                 print("starting feature retrieval...")
                 score, ix = feature_index.search(feat_np, k=8)
                 weight = np.square(1 / score)
                 weight /= weight.sum(axis=1, keepdims=True)
                 npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                 c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
-                c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
+                c = torch.FloatTensor(c).to(self.dev).transpose(0, 1)
                 print("end feature retrieval...")
             else:
                 cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
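For context on the retrieval branch reformatted above: the index's stored vectors are materialised once per speaker via reconstruct_n, each HuBERT frame is matched against its k=8 nearest entries, the neighbours are averaged with inverse-square-distance weights, and the result is blended with the original features by cluster_infer_ratio. A numpy-only sketch with exact L2 search standing in for the faiss index (shapes and values illustrative):

import numpy as np

rng = np.random.default_rng(0)
feat_np = rng.standard_normal((5, 16)).astype(np.float32)    # (frames, dim) queries
big_npy = rng.standard_normal((100, 16)).astype(np.float32)  # (ntotal, dim) index contents

d2 = ((feat_np[:, None, :] - big_npy[None, :, :]) ** 2).sum(-1)  # squared L2 distances
ix = np.argsort(d2, axis=1)[:, :8]                               # k=8 neighbours per frame
score = np.take_along_axis(d2, ix, axis=1)

weight = np.square(1 / score)                # inverse-square distance weights
weight /= weight.sum(axis=1, keepdims=True)  # normalise per frame
npy = np.sum(big_npy[ix] * weight[..., None], axis=1)  # weighted neighbour mean

cluster_infer_ratio = 0.5
c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np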
@@ -257,19 +266,19 @@ class Svc(object):
               noice_scale=0.4,
               f0_filter=False,
               f0_predictor='pm',
-              enhancer_adaptive_key = 0,
-              cr_threshold = 0.05,
-              k_step = 100,
-              frame = 0,
-              spk_mix = False,
-              second_encoding = False,
-              loudness_envelope_adjustment = 1
+              enhancer_adaptive_key=0,
+              cr_threshold=0.05,
+              k_step=100,
+              frame=0,
+              spk_mix=False,
+              second_encoding=False,
+              loudness_envelope_adjustment=1
               ):
         wav, sr = librosa.load(raw_path, sr=self.target_sample)
         if spk_mix:
-            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter, f0_predictor, cr_threshold=cr_threshold)
             n_frames = f0.size(1)
-            sid = speaker[:, frame:frame+n_frames].transpose(0,1)
+            sid = speaker[:, frame:frame + n_frames].transpose(0, 1)
         else:
             speaker_id = self.spk2id.get(speaker)
             if not speaker_id and type(speaker) is int:
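In the spk_mix branch above, speaker is the (num_speakers, total_frames) weight matrix assembled by slice_inference, and each chunk cuts out its own frame window before transposing it to (n_frames, num_speakers) for the generator. Minimal shape check (numbers illustrative):

import torch

speaker = torch.rand(2, 1000)    # per-speaker mix curves over all frames
frame, n_frames = 300, 120       # this chunk's offset and length
sid = speaker[:, frame:frame + n_frames].transpose(0, 1)
print(sid.shape)                 # torch.Size([120, 2])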
@@ -278,7 +287,8 @@ class Svc(object):
             if speaker_id is None:
                 raise RuntimeError("The name you entered is not in the speaker list!")
             sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor,
+                                         cr_threshold=cr_threshold)
         n_frames = f0.size(1)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
@@ -286,43 +296,50 @@ class Svc(object):
         start = time.time()
         vol = None
         if not self.only_diffusion:
-            vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
-            audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
-            audio = audio[0,0].data.float()
-            audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
+            vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None, :])[None, :].to(
+                self.dev) if self.vol_embedding else None
+            audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0,
+                                            noice_scale=noice_scale, vol=vol)
+            audio = audio[0, 0].data.float()
+            audio_mel = self.vocoder.extract(audio[None, :], self.target_sample) if self.shallow_diffusion else None
         else:
             audio = torch.FloatTensor(wav).to(self.dev)
             audio_mel = None
         if self.only_diffusion or self.shallow_diffusion:
-            vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
+            vol = self.volume_extractor.extract(audio[None, :])[None, :, None].to(self.dev) if vol == None else vol[
+                :,
+                :,
+                None]
             if self.shallow_diffusion and second_encoding:
-                audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
+                audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample,
+                                            target_sr=16000)
                 audio16k = torch.from_numpy(audio16k).to(self.dev)
                 c = self.hubert_model.encoder(audio16k)
                 c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
-            f0 = f0[:,:,None]
-            c = c.transpose(-1,-2)
+            f0 = f0[:, :, None]
+            c = c.transpose(-1, -2)
             audio_mel = self.diffusion_model(
-                c,
-                f0,
-                vol,
-                spk_id = sid,
-                spk_mix_dict = None,
-                gt_spec=audio_mel,
-                infer=True,
-                infer_speedup=self.diffusion_args.infer.speedup,
-                method=self.diffusion_args.infer.method,
-                k_step=k_step)
+                c,
+                f0,
+                vol,
+                spk_id=sid,
+                spk_mix_dict=None,
+                gt_spec=audio_mel,
+                infer=True,
+                infer_speedup=self.diffusion_args.infer.speedup,
+                method=self.diffusion_args.infer.method,
+                k_step=k_step)
             audio = self.vocoder.infer(audio_mel, f0).squeeze()
         if self.nsf_hifigan_enhance:
             audio, _ = self.enhancer.enhance(
-                audio[None,:],
-                self.target_sample,
-                f0[:,:,None],
-                self.hps_ms.data.hop_length,
-                adaptive_key = enhancer_adaptive_key)
+                audio[None, :],
+                self.target_sample,
+                f0[:, :, None],
+                self.hps_ms.data.hop_length,
+                adaptive_key=enhancer_adaptive_key)
         if loudness_envelope_adjustment != 1:
-            audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
+            audio = utils.change_rms(wav, self.target_sample, audio, self.target_sample,
+                                     loudness_envelope_adjustment)
         use_time = time.time() - start
         print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1], n_frames
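A note on the return contract used below: infer() returns (audio, audio.shape[-1], n_frames), so the middle value, which slice_inference unpacks as out_sr, is the clip's length in samples rather than a sample rate; only out_audio and out_frame are actually consumed. Minimal illustration (values assumed):

import torch

audio = torch.zeros(44100)          # one second at 44.1 kHz, for illustration
out_audio, out_sr, out_frame = audio, audio.shape[-1], 86
assert out_sr == 44100              # sample count, despite the name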
@@ -335,7 +352,7 @@ class Svc(object):
         # unload model
         self.net_g_ms = self.net_g_ms.to("cpu")
         del self.net_g_ms
-        if hasattr(self,"enhancer"):
+        if hasattr(self, "enhancer"):
             self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
             del self.enhancer.enhancer
             del self.enhancer
@@ -352,14 +369,14 @@ class Svc(object):
                         pad_seconds=0.5,
                         clip_seconds=0,
                         lg_num=0,
-                        lgr_num =0.75,
+                        lgr_num=0.75,
                         f0_predictor='pm',
-                        enhancer_adaptive_key = 0,
-                        cr_threshold = 0.05,
-                        k_step = 100,
-                        use_spk_mix = False,
-                        second_encoding = False,
-                        loudness_envelope_adjustment = 1
+                        enhancer_adaptive_key=0,
+                        cr_threshold=0.05,
+                        k_step=100,
+                        use_spk_mix=False,
+                        second_encoding=False,
+                        loudness_envelope_adjustment=1
                         ):
         if use_spk_mix:
             if len(self.spk2id) == 1:
@@ -368,12 +385,12 @@ class Svc(object):
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip_seconds*audio_sr)
-        lg_size = int(lg_num*audio_sr)
-        lg_size_r = int(lg_size*lgr_num)
-        lg_size_c_l = (lg_size-lg_size_r)//2
-        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
+        per_size = int(clip_seconds * audio_sr)
+        lg_size = int(lg_num * audio_sr)
+        lg_size_r = int(lg_size * lgr_num)
+        lg_size_c_l = (lg_size - lg_size_r) // 2
+        lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
+        lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
 
         if use_spk_mix:
             assert len(self.spk2id) == len(spk)
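The overlap bookkeeping reformatted above, with illustrative numbers; per_size is the forced clip length in samples, lg_size the total overlap, and the overlap splits into a crossfaded middle (lg_size_r) plus discarded margins:

audio_sr = 44100
clip_seconds, lg_num, lgr_num = 30, 1, 0.75

per_size = int(clip_seconds * audio_sr)          # samples per forced clip
lg_size = int(lg_num * audio_sr)                 # total overlap between clips
lg_size_r = int(lg_size * lgr_num)               # portion used for the crossfade
lg_size_c_l = (lg_size - lg_size_r) // 2         # left margin dropped
lg_size_c_r = lg_size - lg_size_r - lg_size_c_l  # right margin dropped
assert lg_size_c_l + lg_size_r + lg_size_c_r == lg_size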
@@ -384,10 +401,10 @@ class Svc(object):
                     audio_length += aud_length // self.hop_size
                     continue
                 if per_size != 0:
-                    datas = split_list_by_n(data, per_size,lg_size)
+                    datas = split_list_by_n(data, per_size, lg_size)
                 else:
                     datas = [data]
-                for k,dat in enumerate(datas):
+                for k, dat in enumerate(datas):
                     pad_len = int(audio_sr * pad_seconds)
                     per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
                     a_length = per_length + 2 * pad_len
@@ -397,14 +414,14 @@ class Svc(object):
             for i in range(len(spk)):
                 last_end = None
                 for mix in spk[i]:
-                    if mix[3]<0. or mix[2]<0.:
+                    if mix[3] < 0. or mix[2] < 0.:
                         raise RuntimeError("mix value must higer Than zero!")
                     begin = int(audio_length * mix[0])
                     end = int(audio_length * mix[1])
                     length = end - begin
-                    if length<=0:
+                    if length <= 0:
                         raise RuntimeError("begin Must lower Than end!")
-                    step = (mix[3] - mix[2])/length
+                    step = (mix[3] - mix[2]) / length
                     if last_end is not None:
                         if last_end != begin:
                             raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
@@ -412,20 +429,20 @@ class Svc(object):
                     if step == 0.:
                         spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
                     else:
-                        spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
-                    if(len(spk_mix_data)<length):
+                        spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
+                    if (len(spk_mix_data) < length):
                         num_pad = length - len(spk_mix_data)
                         spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
                     spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
 
-            spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
+            spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
             # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
             for i, x in enumerate(spk_mix_ten[0]):
                 if x == 0.0:
                     spk_mix_ten[0][i] = 1.0
-                    spk_mix_tensor[:,i] = 1.0 / len(spk)
+                    spk_mix_tensor[:, i] = 1.0 / len(spk)
             spk_mix_tensor = spk_mix_tensor / spk_mix_ten
-            if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
+            if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
                 raise RuntimeError("sum(spk_mix_tensor) not equal 1")
             spk = spk_mix_tensor
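The hunk above builds one weight curve per speaker: each mix row is [begin_frac, end_frac, w_start, w_end], filled in as a linear ramp over its frame range, after which the per-frame sum across speakers is normalised to 1. A self-contained sketch (illustrative values; the zero-sum fallback from the hunk is omitted):

import torch

audio_length = 100                   # total frames (illustrative)
spk = [
    [[0.0, 1.0, 1.0, 0.0]],          # speaker 0 fades out
    [[0.0, 1.0, 0.0, 1.0]],          # speaker 1 fades in
]
spk_mix_tensor = torch.zeros(len(spk), audio_length)
for i in range(len(spk)):
    for begin_f, end_f, w0, w1 in spk[i]:
        begin, end = int(audio_length * begin_f), int(audio_length * end_f)
        length = end - begin
        step = (w1 - w0) / length
        if step == 0.:
            ramp = torch.zeros(length) + w0
        else:
            ramp = torch.arange(w0, w1, step)    # linear ramp, like the hunk
        spk_mix_tensor[i][begin:end] = ramp[:length]

total = torch.sum(spk_mix_tensor, dim=0)         # per-frame sum across speakers
spk_mix_tensor = spk_mix_tensor / total          # normalise each frame to sum to 1
assert ((spk_mix_tensor.sum(dim=0) - 1.).abs() < 1e-4).all()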
 
@@ -442,12 +459,12 @@ class Svc(object):
                 global_frame += length // self.hop_size
                 continue
             if per_size != 0:
-                datas = split_list_by_n(data, per_size,lg_size)
+                datas = split_list_by_n(data, per_size, lg_size)
             else:
                 datas = [data]
-            for k,dat in enumerate(datas):
-                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
-                if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+            for k, dat in enumerate(datas):
+                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
+                if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                 # padd
                 pad_len = int(audio_sr * pad_seconds)
                 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -455,33 +472,34 @@ class Svc(object):
                 soundfile.write(raw_path, dat, audio_sr, format="wav")
                 raw_path.seek(0)
                 out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
-                                                        cluster_infer_ratio=cluster_infer_ratio,
-                                                        auto_predict_f0=auto_predict_f0,
-                                                        noice_scale=noice_scale,
-                                                        f0_predictor = f0_predictor,
-                                                        enhancer_adaptive_key = enhancer_adaptive_key,
-                                                        cr_threshold = cr_threshold,
-                                                        k_step = k_step,
-                                                        frame = global_frame,
-                                                        spk_mix = use_spk_mix,
-                                                        second_encoding = second_encoding,
-                                                        loudness_envelope_adjustment = loudness_envelope_adjustment
-                                                        )
+                                                          cluster_infer_ratio=cluster_infer_ratio,
+                                                          auto_predict_f0=auto_predict_f0,
+                                                          noice_scale=noice_scale,
+                                                          f0_predictor=f0_predictor,
+                                                          enhancer_adaptive_key=enhancer_adaptive_key,
+                                                          cr_threshold=cr_threshold,
+                                                          k_step=k_step,
+                                                          frame=global_frame,
+                                                          spk_mix=use_spk_mix,
+                                                          second_encoding=second_encoding,
+                                                          loudness_envelope_adjustment=loudness_envelope_adjustment
+                                                          )
                 global_frame += out_frame
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
                 _audio = _audio[pad_len:-pad_len]
                 _audio = pad_array(_audio, per_length)
-                if lg_size!=0 and k!=0:
-                    lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
-                    lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
-                    lg_pre = lg1*(1-lg)+lg2*lg
-                    audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
+                if lg_size != 0 and k != 0:
+                    lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
+                    lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
+                    lg_pre = lg1 * (1 - lg) + lg2 * lg
+                    audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                     audio.extend(lg_pre)
-                    _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
+                    _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                 audio.extend(list(_audio))
         return np.array(audio)
 
+
 class RealTimeVC:
     def __init__(self):
         self.last_chunk = None
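The crossfade rewritten above blends the tail of the audio assembled so far (lg1) with the head of the next chunk (lg2) using the linear ramp lg, so chunk seams do not click. A numpy sketch of the lgr_num == 1 case (lengths illustrative):

import numpy as np

lg_size = 400                       # overlap in samples
lg = np.linspace(0, 1, lg_size)     # 0 -> 1 fade ramp

audio = list(np.ones(2000))         # output assembled so far
_audio = np.full(2400, 0.5)         # next converted chunk, starts inside the overlap

lg1 = np.array(audio[-lg_size:])    # tail of existing audio
lg2 = _audio[:lg_size]              # head of the new chunk
lg_pre = lg1 * (1 - lg) + lg2 * lg  # equal-gain linear crossfade

audio = audio[:-lg_size]            # drop the un-faded tail
audio.extend(lg_pre)                # splice in the crossfaded overlap
audio.extend(_audio[lg_size:])      # then the remainder of the new chunk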
@@ -509,7 +527,7 @@ class RealTimeVC:
                                                auto_predict_f0=auto_predict_f0,
                                                noice_scale=noice_scale,
                                                f0_filter=f0_filter)
-
+
             audio = audio.cpu().numpy()
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
@@ -530,4 +548,3 @@ class RealTimeVC:
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
             return ret[self.chunk_len:2 * self.chunk_len]
-