Katock committed
Commit 16c3ddb · 1 Parent(s): 533346a
inference/infer_tool.py CHANGED
@@ -1,12 +1,12 @@
- import gc
  import hashlib
  import io
  import json
  import logging
  import os
- import pickle
  import time
  from pathlib import Path
+ from inference import slicer
+ import gc

  import librosa
  import numpy as np
@@ -17,8 +17,11 @@ import torchaudio

  import cluster
  import utils
- from inference import slicer
  from models import SynthesizerTrn
+ import pickle
+
+ from diffusion.unit2mel import load_model_vocoder
+ import yaml

  logging.getLogger('matplotlib').setLevel(logging.WARNING)

@@ -82,19 +85,16 @@ def get_end_file(dir_path, end):
  def get_md5(content):
      return hashlib.new("md5", content).hexdigest()

-
  def fill_a_to_b(a, b):
      if len(a) < len(b):
          for _ in range(0, len(b) - len(a)):
              a.append(a[0])

-
  def mkdir(paths: list):
      for path in paths:
          if not os.path.exists(path):
              os.mkdir(path)

-
  def pad_array(arr, target_length):
      current_length = arr.shape[0]
      if current_length >= target_length:
@@ -105,55 +105,90 @@ def pad_array(arr, target_length):
          pad_right = pad_width - pad_left
          padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
          return padded_arr
-
-
+
  def split_list_by_n(list_collection, n, pre=0):
      for i in range(0, len(list_collection), n):
-         yield list_collection[i - pre if i - pre >= 0 else i: i + n]
+         yield list_collection[i-pre if i-pre>=0 else i: i + n]


  class F0FilterException(Exception):
      pass

-
  class Svc(object):
      def __init__(self, net_g_path, config_path,
                   device=None,
-                  cluster_model_path="logs/44k/kmeans_10000.pt"):
+                  cluster_model_path="logs/44k/kmeans_10000.pt",
+                  nsf_hifigan_enhance = False,
+                  diffusion_model_path="logs/44k/diffusion/model_0.pt",
+                  diffusion_config_path="configs/diffusion.yaml",
+                  shallow_diffusion = False,
+                  only_diffusion = False,
+                  spk_mix_enable = False,
+                  feature_retrieval = False
+                  ):
          self.net_g_path = net_g_path
+         self.only_diffusion = only_diffusion
+         self.shallow_diffusion = shallow_diffusion
+         self.feature_retrieval = feature_retrieval
          if device is None:
              self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          else:
              self.dev = torch.device(device)
          self.net_g_ms = None
-         self.hps_ms = utils.get_hparams_from_file(config_path)
-         self.target_sample = self.hps_ms.data.sampling_rate
-         self.hop_size = self.hps_ms.data.hop_length
-         self.spk2id = self.hps_ms.spk
-         try:
-             self.vol_embedding = self.hps_ms.model.vol_embedding
-         except Exception as e:
-             self.vol_embedding = False
-         try:
-             self.speech_encoder = self.hps_ms.model.speech_encoder
-         except Exception as e:
-             self.speech_encoder = 'vec768l12'
-
-         self.hubert_model = utils.get_hubert_model().to(self.dev)
-         self.load_model()
-         self.volume_extractor = utils.Volume_Extractor(self.hop_size)
-
+         if not self.only_diffusion:
+             self.hps_ms = utils.get_hparams_from_file(config_path)
+             self.target_sample = self.hps_ms.data.sampling_rate
+             self.hop_size = self.hps_ms.data.hop_length
+             self.spk2id = self.hps_ms.spk
+             try:
+                 self.vol_embedding = self.hps_ms.model.vol_embedding
+             except Exception as e:
+                 self.vol_embedding = False
+             try:
+                 self.speech_encoder = self.hps_ms.model.speech_encoder
+             except Exception as e:
+                 self.speech_encoder = 'vec768l12'
+
+         self.nsf_hifigan_enhance = nsf_hifigan_enhance
+         if self.shallow_diffusion or self.only_diffusion:
+             if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
+                 self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
+                 if self.only_diffusion:
+                     self.target_sample = self.diffusion_args.data.sampling_rate
+                     self.hop_size = self.diffusion_args.data.block_size
+                     self.spk2id = self.diffusion_args.spk
+                     self.speech_encoder = self.diffusion_args.data.encoder
+                     if spk_mix_enable:
+                         self.diffusion_model.init_spkmix(len(self.spk2id))
+             else:
+                 print("No diffusion model or config found. Shallow diffusion mode will False")
+                 self.shallow_diffusion = self.only_diffusion = False
+
+         # load hubert and model
+         if not self.only_diffusion:
+             self.load_model(spk_mix_enable)
+             self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
+             self.volume_extractor = utils.Volume_Extractor(self.hop_size)
+         else:
+             self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
+             self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
+
          if os.path.exists(cluster_model_path):
              if self.feature_retrieval:
-                 with open(cluster_model_path, "rb") as f:
+                 with open(cluster_model_path,"rb") as f:
                      self.cluster_model = pickle.load(f)
                  self.big_npy = None
                  self.now_spk_id = -1
              else:
                  self.cluster_model = cluster.get_cluster_model(cluster_model_path)
          else:
-             self.feature_retrieval = False
+             self.feature_retrieval=False

+         if self.shallow_diffusion : self.nsf_hifigan_enhance = False
+         if self.nsf_hifigan_enhance:
+             from modules.enhancer import Enhancer
+             self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
+
      def load_model(self, spk_mix_enable=False):
          # get model configuration
          self.net_g_ms = SynthesizerTrn(
@@ -168,12 +203,10 @@ class Svc(object):
          if spk_mix_enable:
              self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)

-     def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
-
-         f0_predictor_object = utils.get_f0_predictor(f0_predictor, hop_length=self.hop_size,
-                                                      sampling_rate=self.target_sample, device=self.dev,
-                                                      threshold=cr_threshold)
+     def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):

+         f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
+
          f0, uv = f0_predictor_object.compute_f0_uv(wav)
          if f0_filter and sum(f0) == 0:
              raise F0FilterException("No voice detected")
@@ -187,10 +220,9 @@ class Svc(object):
          wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
          wav16k = torch.from_numpy(wav16k).to(self.dev)
          c = self.hubert_model.encoder(wav16k)
-         # c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
          c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])

-         if cluster_infer_ratio != 0:
+         if cluster_infer_ratio !=0:
              if self.feature_retrieval:
                  speaker_id = self.spk2id.get(speaker)
                  if speaker_id is None:
@@ -199,17 +231,17 @@ class Svc(object):
                      if len(self.spk2id.__dict__) >= speaker:
                          speaker_id = speaker
                  feature_index = self.cluster_model[speaker_id]
-                 feat_np = c.transpose(0, 1).cpu().numpy()
+                 feat_np = c.transpose(0,1).cpu().numpy()
                  if self.big_npy is None or self.now_spk_id != speaker_id:
-                     self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
-                     self.now_spk_id = speaker_id
+                     self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
+                     self.now_spk_id = speaker_id
                  print("starting feature retrieval...")
                  score, ix = feature_index.search(feat_np, k=8)
                  weight = np.square(1 / score)
                  weight /= weight.sum(axis=1, keepdims=True)
                  npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                  c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
-                 c = torch.FloatTensor(c).to(self.dev).transpose(0, 1)
+                 c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
                  print("end feature retrieval...")
              else:
                  cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
@@ -225,35 +257,72 @@ class Svc(object):
                noice_scale=0.4,
                f0_filter=False,
                f0_predictor='pm',
-               enhancer_adaptive_key=0,
-               cr_threshold=0.05,
-               k_step=100,
-               frame=0,
-               spk_mix=False,
-               second_encoding=False,
-               loudness_envelope_adjustment=1
+               enhancer_adaptive_key = 0,
+               cr_threshold = 0.05,
+               k_step = 100,
+               frame = 0,
+               spk_mix = False,
+               second_encoding = False,
+               loudness_envelope_adjustment = 1
                ):
          wav, sr = librosa.load(raw_path, sr=self.target_sample)
-         speaker_id = self.spk2id.get(speaker)
-         if not speaker_id and type(speaker) is int:
-             if len(self.spk2id.__dict__) >= speaker:
-                 speaker_id = speaker
-         if speaker_id is None:
-             raise RuntimeError("The name you entered is not in the speaker list!")
-         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-         c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor,
-                                      cr_threshold=cr_threshold)
-         n_frames = f0.size(1)
+         if spk_mix:
+             c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+             n_frames = f0.size(1)
+             sid = speaker[:, frame:frame+n_frames].transpose(0,1)
+         else:
+             speaker_id = self.spk2id.get(speaker)
+             if not speaker_id and type(speaker) is int:
+                 if len(self.spk2id.__dict__) >= speaker:
+                     speaker_id = speaker
+             if speaker_id is None:
+                 raise RuntimeError("The name you entered is not in the speaker list!")
+             sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+             c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+             n_frames = f0.size(1)
          if "half" in self.net_g_path and torch.cuda.is_available():
              c = c.half()
          with torch.no_grad():
              start = time.time()
              vol = None
-             vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None, :])[None, :].to(
-                 self.dev) if self.vol_embedding else None
-             audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,
-                                             vol=vol)
-             audio = audio[0, 0].data.float()
+             if not self.only_diffusion:
+                 vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
+                 audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
+                 audio = audio[0,0].data.float()
+                 audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
+             else:
+                 audio = torch.FloatTensor(wav).to(self.dev)
+                 audio_mel = None
+             if self.only_diffusion or self.shallow_diffusion:
+                 vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
+                 if self.shallow_diffusion and second_encoding:
+                     audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
+                     audio16k = torch.from_numpy(audio16k).to(self.dev)
+                     c = self.hubert_model.encoder(audio16k)
+                     c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+                 f0 = f0[:,:,None]
+                 c = c.transpose(-1,-2)
+                 audio_mel = self.diffusion_model(
+                     c,
+                     f0,
+                     vol,
+                     spk_id = sid,
+                     spk_mix_dict = None,
+                     gt_spec=audio_mel,
+                     infer=True,
+                     infer_speedup=self.diffusion_args.infer.speedup,
+                     method=self.diffusion_args.infer.method,
+                     k_step=k_step)
+                 audio = self.vocoder.infer(audio_mel, f0).squeeze()
+             if self.nsf_hifigan_enhance:
+                 audio, _ = self.enhancer.enhance(
+                     audio[None,:],
+                     self.target_sample,
+                     f0[:,:,None],
+                     self.hps_ms.data.hop_length,
+                     adaptive_key = enhancer_adaptive_key)
+             if loudness_envelope_adjustment != 1:
+                 audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
          use_time = time.time() - start
          print("vits use time:{}".format(use_time))
          return audio, audio.shape[-1], n_frames
@@ -266,7 +335,7 @@ class Svc(object):
          # unload model
          self.net_g_ms = self.net_g_ms.to("cpu")
          del self.net_g_ms
-         if hasattr(self, "enhancer"):
+         if hasattr(self,"enhancer"):
              self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
              del self.enhancer.enhancer
              del self.enhancer
@@ -283,14 +352,14 @@ class Svc(object):
                          pad_seconds=0.5,
                          clip_seconds=0,
                          lg_num=0,
-                         lgr_num=0.75,
+                         lgr_num =0.75,
                          f0_predictor='pm',
-                         enhancer_adaptive_key=0,
-                         cr_threshold=0.05,
-                         k_step=100,
-                         use_spk_mix=False,
-                         second_encoding=False,
-                         loudness_envelope_adjustment=1
+                         enhancer_adaptive_key = 0,
+                         cr_threshold = 0.05,
+                         k_step = 100,
+                         use_spk_mix = False,
+                         second_encoding = False,
+                         loudness_envelope_adjustment = 1
                          ):
          if use_spk_mix:
              if len(self.spk2id) == 1:
@@ -299,12 +368,12 @@ class Svc(object):
          wav_path = Path(raw_audio_path).with_suffix('.wav')
          chunks = slicer.cut(wav_path, db_thresh=slice_db)
          audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-         per_size = int(clip_seconds * audio_sr)
-         lg_size = int(lg_num * audio_sr)
-         lg_size_r = int(lg_size * lgr_num)
-         lg_size_c_l = (lg_size - lg_size_r) // 2
-         lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
-         lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
+         per_size = int(clip_seconds*audio_sr)
+         lg_size = int(lg_num*audio_sr)
+         lg_size_r = int(lg_size*lgr_num)
+         lg_size_c_l = (lg_size-lg_size_r)//2
+         lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
+         lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0

          if use_spk_mix:
              assert len(self.spk2id) == len(spk)
@@ -315,10 +384,10 @@ class Svc(object):
                      audio_length += aud_length // self.hop_size
                      continue
                  if per_size != 0:
-                     datas = split_list_by_n(data, per_size, lg_size)
+                     datas = split_list_by_n(data, per_size,lg_size)
                  else:
                      datas = [data]
-                 for k, dat in enumerate(datas):
+                 for k,dat in enumerate(datas):
                      pad_len = int(audio_sr * pad_seconds)
                      per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
                      a_length = per_length + 2 * pad_len
@@ -328,14 +397,14 @@ class Svc(object):
              for i in range(len(spk)):
                  last_end = None
                  for mix in spk[i]:
-                     if mix[3] < 0. or mix[2] < 0.:
+                     if mix[3]<0. or mix[2]<0.:
                          raise RuntimeError("mix value must higer Than zero!")
                      begin = int(audio_length * mix[0])
                      end = int(audio_length * mix[1])
                      length = end - begin
-                     if length <= 0:
+                     if length<=0:
                          raise RuntimeError("begin Must lower Than end!")
-                     step = (mix[3] - mix[2]) / length
+                     step = (mix[3] - mix[2])/length
                      if last_end is not None:
                          if last_end != begin:
                              raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
@@ -343,20 +412,20 @@ class Svc(object):
                      if step == 0.:
                          spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
                      else:
-                         spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
-                     if (len(spk_mix_data) < length):
+                         spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
+                     if(len(spk_mix_data)<length):
                          num_pad = length - len(spk_mix_data)
                          spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
                      spk_mix_tensor[i][begin:end] = spk_mix_data[:length]

-             spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
+             spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
              # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
              for i, x in enumerate(spk_mix_ten[0]):
                  if x == 0.0:
                      spk_mix_ten[0][i] = 1.0
-                     spk_mix_tensor[:, i] = 1.0 / len(spk)
+                     spk_mix_tensor[:,i] = 1.0 / len(spk)
              spk_mix_tensor = spk_mix_tensor / spk_mix_ten
-             if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
+             if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
                  raise RuntimeError("sum(spk_mix_tensor) not equal 1")
              spk = spk_mix_tensor

@@ -373,12 +442,12 @@ class Svc(object):
                  global_frame += length // self.hop_size
                  continue
              if per_size != 0:
-                 datas = split_list_by_n(data, per_size, lg_size)
+                 datas = split_list_by_n(data, per_size,lg_size)
              else:
                  datas = [data]
-             for k, dat in enumerate(datas):
-                 per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
-                 if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+             for k,dat in enumerate(datas):
+                 per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
+                 if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                  # padd
                  pad_len = int(audio_sr * pad_seconds)
                  dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -386,34 +455,33 @@ class Svc(object):
                  soundfile.write(raw_path, dat, audio_sr, format="wav")
                  raw_path.seek(0)
                  out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
-                                                           cluster_infer_ratio=cluster_infer_ratio,
-                                                           auto_predict_f0=auto_predict_f0,
-                                                           noice_scale=noice_scale,
-                                                           f0_predictor=f0_predictor,
-                                                           enhancer_adaptive_key=enhancer_adaptive_key,
-                                                           cr_threshold=cr_threshold,
-                                                           k_step=k_step,
-                                                           frame=global_frame,
-                                                           spk_mix=use_spk_mix,
-                                                           second_encoding=second_encoding,
-                                                           loudness_envelope_adjustment=loudness_envelope_adjustment
-                                                           )
+                                                           cluster_infer_ratio=cluster_infer_ratio,
+                                                           auto_predict_f0=auto_predict_f0,
+                                                           noice_scale=noice_scale,
+                                                           f0_predictor = f0_predictor,
+                                                           enhancer_adaptive_key = enhancer_adaptive_key,
+                                                           cr_threshold = cr_threshold,
+                                                           k_step = k_step,
+                                                           frame = global_frame,
+                                                           spk_mix = use_spk_mix,
+                                                           second_encoding = second_encoding,
+                                                           loudness_envelope_adjustment = loudness_envelope_adjustment
+                                                           )
                  global_frame += out_frame
                  _audio = out_audio.cpu().numpy()
                  pad_len = int(self.target_sample * pad_seconds)
                  _audio = _audio[pad_len:-pad_len]
                  _audio = pad_array(_audio, per_length)
-                 if lg_size != 0 and k != 0:
-                     lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
-                     lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
-                     lg_pre = lg1 * (1 - lg) + lg2 * lg
-                     audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
+                 if lg_size!=0 and k!=0:
+                     lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
+                     lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
+                     lg_pre = lg1*(1-lg)+lg2*lg
+                     audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                      audio.extend(lg_pre)
-                     _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
+                     _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                  audio.extend(list(_audio))
          return np.array(audio)

-
  class RealTimeVC:
      def __init__(self):
          self.last_chunk = None
@@ -441,7 +509,7 @@ class RealTimeVC:
                                          auto_predict_f0=auto_predict_f0,
                                          noice_scale=noice_scale,
                                          f0_filter=f0_filter)
-
+
              audio = audio.cpu().numpy()
              self.last_chunk = audio[-self.pre_len:]
              self.last_o = audio
@@ -462,3 +530,4 @@ class RealTimeVC:
              self.last_chunk = audio[-self.pre_len:]
              self.last_o = audio
              return ret[self.chunk_len:2 * self.chunk_len]
+
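
For orientation, here is a minimal, hypothetical usage sketch of the `Svc` API after this commit. The checkpoint, config, and audio paths are illustrative placeholders, and only constructor and `slice_inference` arguments that actually appear in the new signatures above are used.

```python
# Hypothetical usage sketch (all paths are placeholders, not part of this commit).
import soundfile
from inference.infer_tool import Svc

model = Svc(
    "logs/44k/G_0.pth",                             # net_g_path: illustrative checkpoint name
    "configs/config.json",                          # config_path: illustrative config name
    cluster_model_path="logs/44k/kmeans_10000.pt",  # default from the new signature
    shallow_diffusion=True,                         # new option added by this commit
    diffusion_model_path="logs/44k/diffusion/model_0.pt",
    diffusion_config_path="configs/diffusion.yaml",
)

audio = model.slice_inference(
    "raw/input.wav",       # illustrative input file
    spk="speaker0",        # must be a key in the model's spk2id mapping
    tran=0,                # pitch shift in semitones
    slice_db=-40,
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
    k_step=100,            # diffusion step count, used when shallow_diffusion is on
)
soundfile.write("results/output.wav", audio, model.target_sample)
```

When `shallow_diffusion` or `only_diffusion` is requested but the diffusion checkpoint is missing, the constructor prints a warning and falls back to plain VITS inference, so the sketch degrades gracefully. Note in passing that the existence check tests `diffusion_model_path` twice, so a missing `diffusion_config_path` is not caught by this guard.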
inference/inference/__init__.py DELETED
File without changes

inference/inference/chunks_temp.json DELETED
@@ -1 +0,0 @@
- {"info": "temp_dict"}
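
The `slice_inference` changes above split long clips with `split_list_by_n` (which re-emits `pre` samples of overlap from the previous chunk) and then blend neighbouring chunks with a linear ramp (`lg_pre = lg1*(1-lg) + lg2*lg`). A toy, self-contained sketch of that blend:

```python
# Toy sketch of the linear crossfade slice_inference uses to join chunks.
import numpy as np

overlap = 4                          # plays the role of lg_size_r in the real code
lg = np.linspace(0, 1, overlap)      # ramp from 0 to 1 across the overlap
tail = np.ones(overlap)              # end of the previous synthesized chunk
head = np.zeros(overlap)             # start of the next synthesized chunk
blend = tail * (1 - lg) + head * lg  # lg_pre = lg1*(1-lg) + lg2*lg
print(blend)                         # approx. [1. 0.667 0.333 0.]: tail fades out, head fades in
```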
inference/inference/infer_tool.py DELETED
@@ -1,533 +0,0 @@
- import hashlib
- import io
- import json
- import logging
- import os
- import time
- from pathlib import Path
- from inference import slicer
- import gc
-
- import librosa
- import numpy as np
- # import onnxruntime
- import soundfile
- import torch
- import torchaudio
-
- import cluster
- import utils
- from models import SynthesizerTrn
- import pickle
-
- from diffusion.unit2mel import load_model_vocoder
- import yaml
-
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
-
-
- def read_temp(file_name):
-     if not os.path.exists(file_name):
-         with open(file_name, "w") as f:
-             f.write(json.dumps({"info": "temp_dict"}))
-         return {}
-     else:
-         try:
-             with open(file_name, "r") as f:
-                 data = f.read()
-             data_dict = json.loads(data)
-             if os.path.getsize(file_name) > 50 * 1024 * 1024:
-                 f_name = file_name.replace("\\", "/").split("/")[-1]
-                 print(f"clean {f_name}")
-                 for wav_hash in list(data_dict.keys()):
-                     if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
-                         del data_dict[wav_hash]
-         except Exception as e:
-             print(e)
-             print(f"{file_name} error,auto rebuild file")
-             data_dict = {"info": "temp_dict"}
-         return data_dict
-
-
- def write_temp(file_name, data):
-     with open(file_name, "w") as f:
-         f.write(json.dumps(data))
-
-
- def timeit(func):
-     def run(*args, **kwargs):
-         t = time.time()
-         res = func(*args, **kwargs)
-         print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
-         return res
-
-     return run
-
-
- def format_wav(audio_path):
-     if Path(audio_path).suffix == '.wav':
-         return
-     raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
-     soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
-
-
- def get_end_file(dir_path, end):
-     file_lists = []
-     for root, dirs, files in os.walk(dir_path):
-         files = [f for f in files if f[0] != '.']
-         dirs[:] = [d for d in dirs if d[0] != '.']
-         for f_file in files:
-             if f_file.endswith(end):
-                 file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-     return file_lists
-
-
- def get_md5(content):
-     return hashlib.new("md5", content).hexdigest()
-
- def fill_a_to_b(a, b):
-     if len(a) < len(b):
-         for _ in range(0, len(b) - len(a)):
-             a.append(a[0])
-
- def mkdir(paths: list):
-     for path in paths:
-         if not os.path.exists(path):
-             os.mkdir(path)
-
- def pad_array(arr, target_length):
-     current_length = arr.shape[0]
-     if current_length >= target_length:
-         return arr
-     else:
-         pad_width = target_length - current_length
-         pad_left = pad_width // 2
-         pad_right = pad_width - pad_left
-         padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
-         return padded_arr
-
- def split_list_by_n(list_collection, n, pre=0):
-     for i in range(0, len(list_collection), n):
-         yield list_collection[i-pre if i-pre>=0 else i: i + n]
-
-
- class F0FilterException(Exception):
-     pass
-
- class Svc(object):
-     def __init__(self, net_g_path, config_path,
-                  device=None,
-                  cluster_model_path="logs/44k/kmeans_10000.pt",
-                  nsf_hifigan_enhance = False,
-                  diffusion_model_path="logs/44k/diffusion/model_0.pt",
-                  diffusion_config_path="configs/diffusion.yaml",
-                  shallow_diffusion = False,
-                  only_diffusion = False,
-                  spk_mix_enable = False,
-                  feature_retrieval = False
-                  ):
-         self.net_g_path = net_g_path
-         self.only_diffusion = only_diffusion
-         self.shallow_diffusion = shallow_diffusion
-         self.feature_retrieval = feature_retrieval
-         if device is None:
-             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         else:
-             self.dev = torch.device(device)
-         self.net_g_ms = None
-         if not self.only_diffusion:
-             self.hps_ms = utils.get_hparams_from_file(config_path)
-             self.target_sample = self.hps_ms.data.sampling_rate
-             self.hop_size = self.hps_ms.data.hop_length
-             self.spk2id = self.hps_ms.spk
-             try:
-                 self.vol_embedding = self.hps_ms.model.vol_embedding
-             except Exception as e:
-                 self.vol_embedding = False
-             try:
-                 self.speech_encoder = self.hps_ms.model.speech_encoder
-             except Exception as e:
-                 self.speech_encoder = 'vec768l12'
-
-         self.nsf_hifigan_enhance = nsf_hifigan_enhance
-         if self.shallow_diffusion or self.only_diffusion:
-             if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
-                 self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
-                 if self.only_diffusion:
-                     self.target_sample = self.diffusion_args.data.sampling_rate
-                     self.hop_size = self.diffusion_args.data.block_size
-                     self.spk2id = self.diffusion_args.spk
-                     self.speech_encoder = self.diffusion_args.data.encoder
-                     if spk_mix_enable:
-                         self.diffusion_model.init_spkmix(len(self.spk2id))
-             else:
-                 print("No diffusion model or config found. Shallow diffusion mode will False")
-                 self.shallow_diffusion = self.only_diffusion = False
-
-         # load hubert and model
-         if not self.only_diffusion:
-             self.load_model(spk_mix_enable)
-             self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
-             self.volume_extractor = utils.Volume_Extractor(self.hop_size)
-         else:
-             self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
-             self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
-
-         if os.path.exists(cluster_model_path):
-             if self.feature_retrieval:
-                 with open(cluster_model_path,"rb") as f:
-                     self.cluster_model = pickle.load(f)
-                 self.big_npy = None
-                 self.now_spk_id = -1
-             else:
-                 self.cluster_model = cluster.get_cluster_model(cluster_model_path)
-         else:
-             self.feature_retrieval=False
-
-         if self.shallow_diffusion : self.nsf_hifigan_enhance = False
-         if self.nsf_hifigan_enhance:
-             from modules.enhancer import Enhancer
-             self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
-
-     def load_model(self, spk_mix_enable=False):
-         # get model configuration
-         self.net_g_ms = SynthesizerTrn(
-             self.hps_ms.data.filter_length // 2 + 1,
-             self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
-             **self.hps_ms.model)
-         _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
-         if "half" in self.net_g_path and torch.cuda.is_available():
-             _ = self.net_g_ms.half().eval().to(self.dev)
-         else:
-             _ = self.net_g_ms.eval().to(self.dev)
-         if spk_mix_enable:
-             self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
-
-     def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
-
-         f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
-
-         f0, uv = f0_predictor_object.compute_f0_uv(wav)
-         if f0_filter and sum(f0) == 0:
-             raise F0FilterException("No voice detected")
-         f0 = torch.FloatTensor(f0).to(self.dev)
-         uv = torch.FloatTensor(uv).to(self.dev)
-
-         f0 = f0 * 2 ** (tran / 12)
-         f0 = f0.unsqueeze(0)
-         uv = uv.unsqueeze(0)
-
-         wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
-         wav16k = torch.from_numpy(wav16k).to(self.dev)
-         c = self.hubert_model.encoder(wav16k)
-         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
-
-         if cluster_infer_ratio !=0:
-             if self.feature_retrieval:
-                 speaker_id = self.spk2id.get(speaker)
-                 if speaker_id is None:
-                     raise RuntimeError("The name you entered is not in the speaker list!")
-                 if not speaker_id and type(speaker) is int:
-                     if len(self.spk2id.__dict__) >= speaker:
-                         speaker_id = speaker
-                 feature_index = self.cluster_model[speaker_id]
-                 feat_np = c.transpose(0,1).cpu().numpy()
-                 if self.big_npy is None or self.now_spk_id != speaker_id:
-                     self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
-                     self.now_spk_id = speaker_id
-                 print("starting feature retrieval...")
-                 score, ix = feature_index.search(feat_np, k=8)
-                 weight = np.square(1 / score)
-                 weight /= weight.sum(axis=1, keepdims=True)
-                 npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-                 c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
-                 c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
-                 print("end feature retrieval...")
-             else:
-                 cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
-                 cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
-                 c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
-
-         c = c.unsqueeze(0)
-         return c, f0, uv
-
-     def infer(self, speaker, tran, raw_path,
-               cluster_infer_ratio=0,
-               auto_predict_f0=False,
-               noice_scale=0.4,
-               f0_filter=False,
-               f0_predictor='pm',
-               enhancer_adaptive_key = 0,
-               cr_threshold = 0.05,
-               k_step = 100,
-               frame = 0,
-               spk_mix = False,
-               second_encoding = False,
-               loudness_envelope_adjustment = 1
-               ):
-         wav, sr = librosa.load(raw_path, sr=self.target_sample)
-         if spk_mix:
-             c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
-             n_frames = f0.size(1)
-             sid = speaker[:, frame:frame+n_frames].transpose(0,1)
-         else:
-             speaker_id = self.spk2id.get(speaker)
-             if not speaker_id and type(speaker) is int:
-                 if len(self.spk2id.__dict__) >= speaker:
-                     speaker_id = speaker
-             if speaker_id is None:
-                 raise RuntimeError("The name you entered is not in the speaker list!")
-             sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-             c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
-             n_frames = f0.size(1)
-         if "half" in self.net_g_path and torch.cuda.is_available():
-             c = c.half()
-         with torch.no_grad():
-             start = time.time()
-             vol = None
-             if not self.only_diffusion:
-                 vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
-                 audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
-                 audio = audio[0,0].data.float()
-                 audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
-             else:
-                 audio = torch.FloatTensor(wav).to(self.dev)
-                 audio_mel = None
-             if self.only_diffusion or self.shallow_diffusion:
-                 vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
-                 if self.shallow_diffusion and second_encoding:
-                     audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
-                     audio16k = torch.from_numpy(audio16k).to(self.dev)
-                     c = self.hubert_model.encoder(audio16k)
-                     c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
-                 f0 = f0[:,:,None]
-                 c = c.transpose(-1,-2)
-                 audio_mel = self.diffusion_model(
-                     c,
-                     f0,
-                     vol,
-                     spk_id = sid,
-                     spk_mix_dict = None,
-                     gt_spec=audio_mel,
-                     infer=True,
-                     infer_speedup=self.diffusion_args.infer.speedup,
-                     method=self.diffusion_args.infer.method,
-                     k_step=k_step)
-                 audio = self.vocoder.infer(audio_mel, f0).squeeze()
-             if self.nsf_hifigan_enhance:
-                 audio, _ = self.enhancer.enhance(
-                     audio[None,:],
-                     self.target_sample,
-                     f0[:,:,None],
-                     self.hps_ms.data.hop_length,
-                     adaptive_key = enhancer_adaptive_key)
-             if loudness_envelope_adjustment != 1:
-                 audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
-         use_time = time.time() - start
-         print("vits use time:{}".format(use_time))
-         return audio, audio.shape[-1], n_frames
-
-     def clear_empty(self):
-         # clean up vram
-         torch.cuda.empty_cache()
-
-     def unload_model(self):
-         # unload model
-         self.net_g_ms = self.net_g_ms.to("cpu")
-         del self.net_g_ms
-         if hasattr(self,"enhancer"):
-             self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
-             del self.enhancer.enhancer
-             del self.enhancer
-         gc.collect()
-
-     def slice_inference(self,
-                         raw_audio_path,
-                         spk,
-                         tran,
-                         slice_db,
-                         cluster_infer_ratio,
-                         auto_predict_f0,
-                         noice_scale,
-                         pad_seconds=0.5,
-                         clip_seconds=0,
-                         lg_num=0,
-                         lgr_num =0.75,
-                         f0_predictor='pm',
-                         enhancer_adaptive_key = 0,
-                         cr_threshold = 0.05,
-                         k_step = 100,
-                         use_spk_mix = False,
-                         second_encoding = False,
-                         loudness_envelope_adjustment = 1
-                         ):
-         if use_spk_mix:
-             if len(self.spk2id) == 1:
-                 spk = self.spk2id.keys()[0]
-                 use_spk_mix = False
-         wav_path = Path(raw_audio_path).with_suffix('.wav')
-         chunks = slicer.cut(wav_path, db_thresh=slice_db)
-         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-         per_size = int(clip_seconds*audio_sr)
-         lg_size = int(lg_num*audio_sr)
-         lg_size_r = int(lg_size*lgr_num)
-         lg_size_c_l = (lg_size-lg_size_r)//2
-         lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-         lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
-         if use_spk_mix:
-             assert len(self.spk2id) == len(spk)
-             audio_length = 0
-             for (slice_tag, data) in audio_data:
-                 aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
-                 if slice_tag:
-                     audio_length += aud_length // self.hop_size
-                     continue
-                 if per_size != 0:
-                     datas = split_list_by_n(data, per_size,lg_size)
-                 else:
-                     datas = [data]
-                 for k,dat in enumerate(datas):
-                     pad_len = int(audio_sr * pad_seconds)
-                     per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
-                     a_length = per_length + 2 * pad_len
-                     audio_length += a_length // self.hop_size
-                 audio_length += len(audio_data)
-             spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
-             for i in range(len(spk)):
-                 last_end = None
-                 for mix in spk[i]:
-                     if mix[3]<0. or mix[2]<0.:
-                         raise RuntimeError("mix value must higer Than zero!")
-                     begin = int(audio_length * mix[0])
-                     end = int(audio_length * mix[1])
-                     length = end - begin
-                     if length<=0:
-                         raise RuntimeError("begin Must lower Than end!")
-                     step = (mix[3] - mix[2])/length
-                     if last_end is not None:
-                         if last_end != begin:
-                             raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
-                     last_end = end
-                     if step == 0.:
-                         spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
-                     else:
-                         spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
-                     if(len(spk_mix_data)<length):
-                         num_pad = length - len(spk_mix_data)
-                         spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
-                     spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
-
-             spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
-             # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
-             for i, x in enumerate(spk_mix_ten[0]):
-                 if x == 0.0:
-                     spk_mix_ten[0][i] = 1.0
-                     spk_mix_tensor[:,i] = 1.0 / len(spk)
-             spk_mix_tensor = spk_mix_tensor / spk_mix_ten
-             if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
-                 raise RuntimeError("sum(spk_mix_tensor) not equal 1")
-             spk = spk_mix_tensor
-
-         global_frame = 0
-         audio = []
-         for (slice_tag, data) in audio_data:
-             print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-             # padd
-             length = int(np.ceil(len(data) / audio_sr * self.target_sample))
-             if slice_tag:
-                 print('jump empty segment')
-                 _audio = np.zeros(length)
-                 audio.extend(list(pad_array(_audio, length)))
-                 global_frame += length // self.hop_size
-                 continue
-             if per_size != 0:
-                 datas = split_list_by_n(data, per_size,lg_size)
-             else:
-                 datas = [data]
-             for k,dat in enumerate(datas):
-                 per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
-                 if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
-                 # padd
-                 pad_len = int(audio_sr * pad_seconds)
-                 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
-                 raw_path = io.BytesIO()
-                 soundfile.write(raw_path, dat, audio_sr, format="wav")
-                 raw_path.seek(0)
-                 out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
-                                                           cluster_infer_ratio=cluster_infer_ratio,
-                                                           auto_predict_f0=auto_predict_f0,
-                                                           noice_scale=noice_scale,
-                                                           f0_predictor = f0_predictor,
-                                                           enhancer_adaptive_key = enhancer_adaptive_key,
-                                                           cr_threshold = cr_threshold,
-                                                           k_step = k_step,
-                                                           frame = global_frame,
-                                                           spk_mix = use_spk_mix,
-                                                           second_encoding = second_encoding,
-                                                           loudness_envelope_adjustment = loudness_envelope_adjustment
-                                                           )
-                 global_frame += out_frame
-                 _audio = out_audio.cpu().numpy()
-                 pad_len = int(self.target_sample * pad_seconds)
-                 _audio = _audio[pad_len:-pad_len]
-                 _audio = pad_array(_audio, per_length)
-                 if lg_size!=0 and k!=0:
-                     lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
-                     lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
-                     lg_pre = lg1*(1-lg)+lg2*lg
-                     audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
-                     audio.extend(lg_pre)
-                     _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
-                 audio.extend(list(_audio))
-         return np.array(audio)
-
- class RealTimeVC:
-     def __init__(self):
-         self.last_chunk = None
-         self.last_o = None
-         self.chunk_len = 16000  # chunk length
-         self.pre_len = 3840  # cross fade length, multiples of 640
-
-     # Input and output are 1-dimensional numpy waveform arrays
-
-     def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
-                 cluster_infer_ratio=0,
-                 auto_predict_f0=False,
-                 noice_scale=0.4,
-                 f0_filter=False):
-
-         import maad
-         audio, sr = torchaudio.load(input_wav_path)
-         audio = audio.cpu().numpy()[0]
-         temp_wav = io.BytesIO()
-         if self.last_chunk is None:
-             input_wav_path.seek(0)
-
-             audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
-                                         cluster_infer_ratio=cluster_infer_ratio,
-                                         auto_predict_f0=auto_predict_f0,
-                                         noice_scale=noice_scale,
-                                         f0_filter=f0_filter)
-
-             audio = audio.cpu().numpy()
-             self.last_chunk = audio[-self.pre_len:]
-             self.last_o = audio
-             return audio[-self.chunk_len:]
-         else:
-             audio = np.concatenate([self.last_chunk, audio])
-             soundfile.write(temp_wav, audio, sr, format="wav")
-             temp_wav.seek(0)
-
-             audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
-                                         cluster_infer_ratio=cluster_infer_ratio,
-                                         auto_predict_f0=auto_predict_f0,
-                                         noice_scale=noice_scale,
-                                         f0_filter=f0_filter)
-
-             audio = audio.cpu().numpy()
-             ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
-             self.last_chunk = audio[-self.pre_len:]
-             self.last_o = audio
-             return ret[self.chunk_len:2 * self.chunk_len]
-
inference/inference/infer_tool_grad.py DELETED
@@ -1,160 +0,0 @@
- import hashlib
- import json
- import logging
- import os
- import time
- from pathlib import Path
- import io
- import librosa
- import maad
- import numpy as np
- from inference import slicer
- import parselmouth
- import soundfile
- import torch
- import torchaudio
-
- from hubert import hubert_model
- import utils
- from models import SynthesizerTrn
- logging.getLogger('numba').setLevel(logging.WARNING)
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
-
- def resize2d_f0(x, target_len):
-     source = np.array(x)
-     source[source < 0.001] = np.nan
-     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
-                        source)
-     res = np.nan_to_num(target)
-     return res
-
- def get_f0(x, p_len,f0_up_key=0):
-
-     time_step = 160 / 16000 * 1000
-     f0_min = 50
-     f0_max = 1100
-     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
-     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
-
-     f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
-         time_step=time_step / 1000, voicing_threshold=0.6,
-         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
-
-     pad_size=(p_len - len(f0) + 1) // 2
-     if(pad_size>0 or p_len - len(f0) - pad_size>0):
-         f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-
-     f0 *= pow(2, f0_up_key / 12)
-     f0_mel = 1127 * np.log(1 + f0 / 700)
-     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
-     f0_mel[f0_mel <= 1] = 1
-     f0_mel[f0_mel > 255] = 255
-     f0_coarse = np.rint(f0_mel).astype(np.int)
-     return f0_coarse, f0
-
- def clean_pitch(input_pitch):
-     num_nan = np.sum(input_pitch == 1)
-     if num_nan / len(input_pitch) > 0.9:
-         input_pitch[input_pitch != 1] = 1
-     return input_pitch
-
-
- def plt_pitch(input_pitch):
-     input_pitch = input_pitch.astype(float)
-     input_pitch[input_pitch == 1] = np.nan
-     return input_pitch
-
-
- def f0_to_pitch(ff):
-     f0_pitch = 69 + 12 * np.log2(ff / 440)
-     return f0_pitch
-
-
- def fill_a_to_b(a, b):
-     if len(a) < len(b):
-         for _ in range(0, len(b) - len(a)):
-             a.append(a[0])
-
-
- def mkdir(paths: list):
-     for path in paths:
-         if not os.path.exists(path):
-             os.mkdir(path)
-
-
- class VitsSvc(object):
-     def __init__(self):
-         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         self.SVCVITS = None
-         self.hps = None
-         self.speakers = None
-         self.hubert_soft = utils.get_hubert_model()
-
-     def set_device(self, device):
-         self.device = torch.device(device)
-         self.hubert_soft.to(self.device)
-         if self.SVCVITS != None:
-             self.SVCVITS.to(self.device)
-
-     def loadCheckpoint(self, path):
-         self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-         self.SVCVITS = SynthesizerTrn(
-             self.hps.data.filter_length // 2 + 1,
-             self.hps.train.segment_size // self.hps.data.hop_length,
-             **self.hps.model)
-         _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
-         _ = self.SVCVITS.eval().to(self.device)
-         self.speakers = self.hps.spk
-
-     def get_units(self, source, sr):
-         source = source.unsqueeze(0).to(self.device)
-         with torch.inference_mode():
-             units = self.hubert_soft.units(source)
-             return units
-
-
-     def get_unit_pitch(self, in_path, tran):
-         source, sr = torchaudio.load(in_path)
-         source = torchaudio.functional.resample(source, sr, 16000)
-         if len(source.shape) == 2 and source.shape[1] >= 2:
-             source = torch.mean(source, dim=0).unsqueeze(0)
-         soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
-         f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
-         return soft, f0
-
-     def infer(self, speaker_id, tran, raw_path):
-         speaker_id = self.speakers[speaker_id]
-         sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
-         soft, pitch = self.get_unit_pitch(raw_path, tran)
-         f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
-         stn_tst = torch.FloatTensor(soft)
-         with torch.no_grad():
-             x_tst = stn_tst.unsqueeze(0).to(self.device)
-             x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
-             audio,_ = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
-         return audio, audio.shape[-1]
-
-     def inference(self,srcaudio,chara,tran,slice_db):
-         sampling_rate, audio = srcaudio
-         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-         if len(audio.shape) > 1:
-             audio = librosa.to_mono(audio.transpose(1, 0))
-         if sampling_rate != 16000:
-             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-         soundfile.write("tmpwav.wav", audio, 16000, format="wav")
-         chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
-         audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
-         audio = []
-         for (slice_tag, data) in audio_data:
-             length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
-             raw_path = io.BytesIO()
-             soundfile.write(raw_path, data, audio_sr, format="wav")
-             raw_path.seek(0)
-             if slice_tag:
-                 _audio = np.zeros(length)
-             else:
-                 out_audio, out_sr = self.infer(chara, tran, raw_path)
-                 _audio = out_audio.cpu().numpy()
-             audio.extend(list(_audio))
-         audio = (np.array(audio) * 32768.0).astype('int16')
-         return (self.hps.data.sampling_rate,audio)
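
The deleted `get_f0` above quantizes parselmouth pitch onto a 255-bin mel scale before handing it to the model. A standalone sketch of that mapping, with `np.int64` substituted for the long-deprecated `np.int` used in the original:

```python
# Standalone sketch of the coarse-f0 quantization from the deleted get_f0:
# pitch in Hz is mapped to the mel scale and binned into integers 1..255,
# where bin 1 doubles as "unvoiced / out of range".
import numpy as np

def coarse_f0(f0, f0_min=50.0, f0_max=1100.0):
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * np.log(1 + np.asarray(f0, dtype=np.float64) / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel = np.clip(f0_mel, 1, 255)         # same effect as the <=1 and >255 clamps
    return np.rint(f0_mel).astype(np.int64)  # np.int64 replaces the deprecated np.int

print(coarse_f0([0.0, 110.0, 440.0]))  # roughly [1 23 122]: 0 Hz falls into the unvoiced bin
```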
inference/inference/slicer.py DELETED
@@ -1,142 +0,0 @@
- import librosa
- import torch
- import torchaudio
-
-
- class Slicer:
-     def __init__(self,
-                  sr: int,
-                  threshold: float = -40.,
-                  min_length: int = 5000,
-                  min_interval: int = 300,
-                  hop_size: int = 20,
-                  max_sil_kept: int = 5000):
-         if not min_length >= min_interval >= hop_size:
-             raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
-         if not max_sil_kept >= hop_size:
-             raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
-         min_interval = sr * min_interval / 1000
-         self.threshold = 10 ** (threshold / 20.)
-         self.hop_size = round(sr * hop_size / 1000)
-         self.win_size = min(round(min_interval), 4 * self.hop_size)
-         self.min_length = round(sr * min_length / 1000 / self.hop_size)
-         self.min_interval = round(min_interval / self.hop_size)
-         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
-
-     def _apply_slice(self, waveform, begin, end):
-         if len(waveform.shape) > 1:
-             return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
-         else:
-             return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
-
-     # @timeit
-     def slice(self, waveform):
-         if len(waveform.shape) > 1:
-             samples = librosa.to_mono(waveform)
-         else:
-             samples = waveform
-         if samples.shape[0] <= self.min_length:
-             return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
-         rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
-         sil_tags = []
-         silence_start = None
-         clip_start = 0
-         for i, rms in enumerate(rms_list):
-             # Keep looping while frame is silent.
-             if rms < self.threshold:
-                 # Record start of silent frames.
-                 if silence_start is None:
-                     silence_start = i
-                 continue
-             # Keep looping while frame is not silent and silence start has not been recorded.
-             if silence_start is None:
-                 continue
-             # Clear recorded silence start if interval is not enough or clip is too short
-             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
-             need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
-             if not is_leading_silence and not need_slice_middle:
-                 silence_start = None
-                 continue
-             # Need slicing. Record the range of silent frames to be removed.
-             if i - silence_start <= self.max_sil_kept:
-                 pos = rms_list[silence_start: i + 1].argmin() + silence_start
-                 if silence_start == 0:
-                     sil_tags.append((0, pos))
-                 else:
-                     sil_tags.append((pos, pos))
-                 clip_start = pos
-             elif i - silence_start <= self.max_sil_kept * 2:
-                 pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
-                 pos += i - self.max_sil_kept
-                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
-                 if silence_start == 0:
-                     sil_tags.append((0, pos_r))
-                     clip_start = pos_r
-                 else:
-                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
-                     clip_start = max(pos_r, pos)
-             else:
-                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
-                 if silence_start == 0:
-                     sil_tags.append((0, pos_r))
-                 else:
-                     sil_tags.append((pos_l, pos_r))
-                 clip_start = pos_r
-             silence_start = None
-         # Deal with trailing silence.
-         total_frames = rms_list.shape[0]
-         if silence_start is not None and total_frames - silence_start >= self.min_interval:
-             silence_end = min(total_frames, silence_start + self.max_sil_kept)
-             pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
-             sil_tags.append((pos, total_frames + 1))
-         # Apply and return slices.
-         if len(sil_tags) == 0:
-             return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
-         else:
-             chunks = []
-             # The first silent section does not start at 0; prepend the leading voiced segment.
-             if sil_tags[0][0]:
-                 chunks.append(
-                     {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
-             for i in range(0, len(sil_tags)):
-                 # Mark voiced segments (skipping the first one).
-                 if i:
-                     chunks.append({"slice": False,
-                                    "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
-                 # Mark all silent segments.
-                 chunks.append({"slice": True,
-                                "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
-             # The last silent section does not reach the end; append the trailing segment.
-             if sil_tags[-1][1] * self.hop_size < len(waveform):
-                 chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
-             chunk_dict = {}
-             for i in range(len(chunks)):
-                 chunk_dict[str(i)] = chunks[i]
-             return chunk_dict
-
-
- def cut(audio_path, db_thresh=-30, min_len=5000):
-     audio, sr = librosa.load(audio_path, sr=None)
-     slicer = Slicer(
-         sr=sr,
-         threshold=db_thresh,
-         min_length=min_len
-     )
-     chunks = slicer.slice(audio)
-     return chunks
-
-
- def chunks2audio(audio_path, chunks):
-     chunks = dict(chunks)
-     audio, sr = torchaudio.load(audio_path)
-     if len(audio.shape) == 2 and audio.shape[1] >= 2:
-         audio = torch.mean(audio, dim=0).unsqueeze(0)
-     audio = audio.cpu().numpy()[0]
-     result = []
-     for k, v in chunks.items():
-         tag = v["split_time"].split(",")
-         if tag[0] != tag[1]:
-             result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
-     return result, sr
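
The nested copy of the slicer is removed here, but the new `from inference import slicer` import in `infer_tool.py` indicates the same `cut`/`chunks2audio` API survives at the top level. A minimal, hypothetical round trip through it (the input path is a placeholder):

```python
# Hypothetical round trip through the retained top-level slicer module,
# assuming inference/slicer.py keeps the API shown in the deletion above.
from inference import slicer

chunks = slicer.cut("raw/input.wav", db_thresh=-40)           # RMS-threshold silence map
segments, sr = slicer.chunks2audio("raw/input.wav", chunks)   # (is_silence, samples) pairs
for is_silence, samples in segments:
    print("silence" if is_silence else "voiced", round(len(samples) / sr, 3), "s")
```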