Katock committed on
Commit f85ad87 · 1 Parent(s): 040c3ba
Files changed (4)
  1. data_utils.py +44 -15
  2. inference_main.py +121 -70
  3. models.py +272 -223
  4. utils.py +149 -140
data_utils.py CHANGED
@@ -7,7 +7,7 @@ import torch.utils.data
7
 
8
  import modules.commons as commons
9
  import utils
10
- from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
11
  from utils import load_wav_to_torch, load_filepaths_and_text
12
 
13
  # import h5py
@@ -23,8 +23,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
23
  3) computes spectrograms from audio files.
24
  """
25
 
26
- def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
27
  self.audiopaths = load_filepaths_and_text(audiopaths)
 
28
  self.max_wav_value = hparams.data.max_wav_value
29
  self.sampling_rate = hparams.data.sampling_rate
30
  self.filter_length = hparams.data.filter_length
@@ -34,7 +35,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
34
  self.use_sr = hparams.train.use_sr
35
  self.spec_len = hparams.train.max_speclen
36
  self.spk_map = hparams.spk
37
-
 
38
  random.seed(1234)
39
  random.shuffle(self.audiopaths)
40
 
@@ -65,34 +67,55 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
65
  spk = filename.split("/")[-2]
66
  spk = torch.LongTensor([self.spk_map[spk]])
67
 
68
- f0 = np.load(filename + ".f0.npy")
69
- f0, uv = utils.interpolate_f0(f0)
70
- f0 = torch.FloatTensor(f0)
71
- uv = torch.FloatTensor(uv)
72
 
73
  c = torch.load(filename+ ".soft.pt")
74
  c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
75
-
 
 
 
 
 
76
 
77
  lmin = min(c.size(-1), spec.size(-1))
78
  assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
79
  assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
80
  spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
81
  audio_norm = audio_norm[:, :lmin * self.hop_length]
 
 
 
82
 
83
- return c, f0, spec, audio_norm, spk, uv
84
-
85
- def random_slice(self, c, f0, spec, audio_norm, spk, uv):
86
  # if spec.shape[1] < 30:
87
  # print("skip too short audio:", filename)
88
  # return None
 
89
  if spec.shape[1] > 800:
90
  start = random.randint(0, spec.shape[1]-800)
91
  end = start + 790
92
  spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
93
  audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
94
-
95
- return c, f0, spec, audio_norm, spk, uv
 
96
 
97
  def __getitem__(self, index):
98
  if self.all_in_mem:
@@ -124,12 +147,14 @@ class TextAudioCollate:
124
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
125
  spkids = torch.LongTensor(len(batch), 1)
126
  uv_padded = torch.FloatTensor(len(batch), max_c_len)
 
127
 
128
  c_padded.zero_()
129
  spec_padded.zero_()
130
  f0_padded.zero_()
131
  wav_padded.zero_()
132
  uv_padded.zero_()
 
133
 
134
  for i in range(len(ids_sorted_decreasing)):
135
  row = batch[ids_sorted_decreasing[i]]
@@ -151,5 +176,9 @@ class TextAudioCollate:
151
 
152
  uv = row[5]
153
  uv_padded[i, :uv.size(0)] = uv
154
-
155
- return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
7
 
8
  import modules.commons as commons
9
  import utils
10
+ from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
11
  from utils import load_wav_to_torch, load_filepaths_and_text
12
 
13
  # import h5py
 
23
  3) computes spectrograms from audio files.
24
  """
25
 
26
+ def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
27
  self.audiopaths = load_filepaths_and_text(audiopaths)
28
+ self.hparams = hparams
29
  self.max_wav_value = hparams.data.max_wav_value
30
  self.sampling_rate = hparams.data.sampling_rate
31
  self.filter_length = hparams.data.filter_length
 
35
  self.use_sr = hparams.train.use_sr
36
  self.spec_len = hparams.train.max_speclen
37
  self.spk_map = hparams.spk
38
+ self.vol_emb = hparams.model.vol_embedding
39
+ self.vol_aug = hparams.train.vol_aug and vol_aug
40
  random.seed(1234)
41
  random.shuffle(self.audiopaths)
42
 
 
67
  spk = filename.split("/")[-2]
68
  spk = torch.LongTensor([self.spk_map[spk]])
69
 
70
+ f0, uv = np.load(filename + ".f0.npy",allow_pickle=True)
71
+
72
+ f0 = torch.FloatTensor(np.array(f0,dtype=float))
73
+ uv = torch.FloatTensor(np.array(uv,dtype=float))
74
 
75
  c = torch.load(filename+ ".soft.pt")
76
  c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
77
+ if self.vol_emb:
78
+ volume_path = filename + ".vol.npy"
79
+ volume = np.load(volume_path)
80
+ volume = torch.from_numpy(volume).float()
81
+ else:
82
+ volume = None
83
 
84
  lmin = min(c.size(-1), spec.size(-1))
85
  assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
86
  assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
87
  spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
88
  audio_norm = audio_norm[:, :lmin * self.hop_length]
89
+ if volume!= None:
90
+ volume = volume[:lmin]
91
+ return c, f0, spec, audio_norm, spk, uv, volume
92
 
93
+ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
 
 
94
  # if spec.shape[1] < 30:
95
  # print("skip too short audio:", filename)
96
  # return None
97
+
98
+ if random.choice([True, False]) and self.vol_aug and volume!=None:
99
+ max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
100
+ max_shift = min(1, np.log10(1/max_amp))
101
+ log10_vol_shift = random.uniform(-1, max_shift)
102
+ audio_norm = audio_norm * (10 ** log10_vol_shift)
103
+ volume = volume * (10 ** log10_vol_shift)
104
+ spec = spectrogram_torch(audio_norm,
105
+ self.hparams.data.filter_length,
106
+ self.hparams.data.sampling_rate,
107
+ self.hparams.data.hop_length,
108
+ self.hparams.data.win_length,
109
+ center=False)[0]
110
+
111
  if spec.shape[1] > 800:
112
  start = random.randint(0, spec.shape[1]-800)
113
  end = start + 790
114
  spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
115
  audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
116
+ if volume !=None:
117
+ volume = volume[start:end]
118
+ return c, f0, spec, audio_norm, spk, uv,volume
119
 
120
  def __getitem__(self, index):
121
  if self.all_in_mem:
 
147
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
148
  spkids = torch.LongTensor(len(batch), 1)
149
  uv_padded = torch.FloatTensor(len(batch), max_c_len)
150
+ volume_padded = torch.FloatTensor(len(batch), max_c_len)
151
 
152
  c_padded.zero_()
153
  spec_padded.zero_()
154
  f0_padded.zero_()
155
  wav_padded.zero_()
156
  uv_padded.zero_()
157
+ volume_padded.zero_()
158
 
159
  for i in range(len(ids_sorted_decreasing)):
160
  row = batch[ids_sorted_decreasing[i]]
 
176
 
177
  uv = row[5]
178
  uv_padded[i, :uv.size(0)] = uv
179
+ volume = row[6]
180
+ if volume != None:
181
+ volume_padded[i, :volume.size(0)] = volume
182
+ else :
183
+ volume_padded = None
184
+ return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
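
Note on the new volume handling in this file: when hparams.model.vol_embedding is set, the loader reads a per-frame loudness curve from "<wav>.vol.npy", and random_slice may apply a random loudness shift before recomputing the spectrogram. A minimal sketch of that gain computation, mirroring the logic in random_slice above (NumPy-only, illustrative names):

import numpy as np

def random_volume_shift(audio_norm, volume):
    # Pick a random gain in the log10 domain, capped so the peak never exceeds 1.0.
    max_amp = float(np.max(np.abs(audio_norm))) + 1e-5
    max_shift = min(1, np.log10(1 / max_amp))
    log10_vol_shift = np.random.uniform(-1, max_shift)
    gain = 10 ** log10_vol_shift
    # The waveform and its frame-level volume curve are scaled by the same factor;
    # the actual loader then recomputes the spectrogram from the scaled waveform.
    return audio_norm * gain, volume * gain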
inference_main.py CHANGED
@@ -2,12 +2,11 @@ import io
2
  import logging
3
  import time
4
  from pathlib import Path
5
-
6
  import librosa
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import soundfile
10
-
11
  from inference import infer_tool
12
  from inference import slicer
13
  from inference.infer_tool import Svc
@@ -16,39 +15,84 @@ logging.getLogger('numba').setLevel(logging.WARNING)
16
  chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
17
 
18
 
19
-
20
  def main():
21
  import argparse
22
 
23
  parser = argparse.ArgumentParser(description='sovits4 inference')
 
  # Required settings
- parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='Model path')
- parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='Config file path')
  parser.add_argument('-cl', '--clip', type=float, default=0, help='Forced audio slicing; the default 0 means automatic slicing, unit: seconds')
- parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='List of wav file names, placed in the raw folder')
  parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='Pitch shift, positive or negative (semitones)')
- parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='Target speaker name(s) for synthesis')
 
  # Optional settings
- parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='Automatically predict pitch for voice conversion; do not enable this when converting singing or it will go badly off-key')
- parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='Cluster model path; fill in anything if no clustering model was trained')
- parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='Ratio of the clustering scheme, range 0-1; leave the default 0 if no clustering model was trained')
- parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='Crossfade length between two audio slices; adjust this if the vocals sound discontinuous after forced slicing, otherwise keep the default 0, unit: seconds')
- parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='Whether to apply a mean filter (pooling) to F0, which helps with some muted notes. Note: enabling it slows down inference; off by default')
 
  # Settings that normally need no change
- parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='Default -40; use -30 for noisy audio, -50 for dry vocals that keep breathing')
  parser.add_argument('-d', '--device', type=str, default=None, help='Inference device; None selects cpu or gpu automatically')
  parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='Noise level; affects articulation and audio quality, somewhat arcane')
- parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='Seconds of padding for the inference audio; for unknown reasons there are artifacts at the start and end, which disappear after padding with a short silence')
  parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='Audio output format')
- parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='After automatic slicing, the head and tail of each slice are discarded. This sets the proportion of the crossfade length to keep, range 0-1, open on the left and closed on the right')
47
 
48
  args = parser.parse_args()
 
49
 
50
- svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path)
51
- infer_tool.mkdir(["raw", "results"])
52
  clean_names = args.clean_names
53
  trans = args.trans
54
  spk_list = args.spk_list
@@ -61,7 +105,37 @@ def main():
61
  clip = args.clip
62
  lg = args.linear_gradient
63
  lgr = args.linear_gradient_retain
64
- F0_mean_pooling = args.f0_mean_pooling
65
 
66
  infer_tool.fill_a_to_b(trans, clean_names)
67
  for clean_name, tran in zip(clean_names, trans):
@@ -69,62 +143,39 @@ def main():
69
  if "." not in raw_audio_path:
70
  raw_audio_path += ".wav"
71
  infer_tool.format_wav(raw_audio_path)
72
- wav_path = Path(raw_audio_path).with_suffix('.wav')
73
- chunks = slicer.cut(wav_path, db_thresh=slice_db)
74
- audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
75
- per_size = int(clip*audio_sr)
76
- lg_size = int(lg*audio_sr)
77
- lg_size_r = int(lg_size*lgr)
78
- lg_size_c_l = (lg_size-lg_size_r)//2
79
- lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
80
- lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
81
-
82
  for spk in spk_list:
83
- audio = []
84
- for (slice_tag, data) in audio_data:
85
- print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
86
-
87
- length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
88
- if slice_tag:
89
- print('jump empty segment')
90
- _audio = np.zeros(length)
91
- audio.extend(list(infer_tool.pad_array(_audio, length)))
92
- continue
93
- if per_size != 0:
94
- datas = infer_tool.split_list_by_n(data, per_size,lg_size)
95
- else:
96
- datas = [data]
97
- for k,dat in enumerate(datas):
98
- per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
99
- if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
100
- # padd
101
- pad_len = int(audio_sr * pad_seconds)
102
- dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
103
- raw_path = io.BytesIO()
104
- soundfile.write(raw_path, dat, audio_sr, format="wav")
105
- raw_path.seek(0)
106
- out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
107
- cluster_infer_ratio=cluster_infer_ratio,
108
- auto_predict_f0=auto_predict_f0,
109
- noice_scale=noice_scale,
110
- F0_mean_pooling = F0_mean_pooling
111
- )
112
- _audio = out_audio.cpu().numpy()
113
- pad_len = int(svc_model.target_sample * pad_seconds)
114
- _audio = _audio[pad_len:-pad_len]
115
- _audio = infer_tool.pad_array(_audio, per_length)
116
- if lg_size!=0 and k!=0:
117
- lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
118
- lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
119
- lg_pre = lg1*(1-lg)+lg2*lg
120
- audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
121
- audio.extend(lg_pre)
122
- _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
123
- audio.extend(list(_audio))
124
  key = "auto" if auto_predict_f0 else f"{tran}key"
125
  cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
126
- res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
 
 
 
 
 
127
  soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
 
 
128
 
129
  if __name__ == '__main__':
130
  main()
 
2
  import logging
3
  import time
4
  from pathlib import Path
5
+ from spkmix import spk_mix_map
6
  import librosa
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import soundfile
 
10
  from inference import infer_tool
11
  from inference import slicer
12
  from inference.infer_tool import Svc
 
15
  chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
16
 
17
 
 
18
  def main():
19
  import argparse
20
 
21
  parser = argparse.ArgumentParser(description='sovits4 inference')
22
 
23
  # Required settings
+ parser.add_argument('-m', '--model_path', type=str, default="logs/44k/", help='Model path')
+ parser.add_argument('-c', '--config_path', type=str, default="configs/", help='Config file path')
  parser.add_argument('-cl', '--clip', type=float, default=0, help='Forced audio slicing; the default 0 means automatic slicing, unit: seconds')
+ parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"],
+                     help='List of wav file names, placed in the raw folder')
  parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='Pitch shift, positive or negative (semitones)')
+ parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='Target speaker name(s) for synthesis')
 
  # Optional settings
+ parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,
+                     help='Automatically predict pitch for voice conversion; do not enable this when converting singing or it will go badly off-key')
+ parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt",
+                     help='Path to the cluster model or feature-retrieval index; fill in anything if neither was trained')
+ parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0,
+                     help='Ratio of the clustering scheme or feature retrieval, range 0-1; leave the default 0 if neither was trained')
+ parser.add_argument('-lg', '--linear_gradient', type=float, default=0,
+                     help='Crossfade length between two audio slices; adjust this if the vocals sound discontinuous after forced slicing, otherwise keep the default 0, unit: seconds')
+ parser.add_argument('-f0p', '--f0_predictor', type=str, default="harvest",
+                     help='F0 predictor to use: crepe, pm, dio or harvest; default pm (note: crepe applies a mean filter to the raw F0)')
+ parser.add_argument('-eh', '--enhance', action='store_true', default=False,
+                     help='Whether to use the NSF_HIFIGAN enhancer; it can improve audio quality for models trained on little data, but hurts well-trained models; off by default')
+ parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False,
+                     help='Whether to use shallow diffusion, which can fix some metallic artifacts; off by default. When enabled, the NSF_HIFIGAN enhancer is disabled')
+ parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='Whether to use speaker (character) mixing')
+ parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1,
+                     help='Mix ratio for replacing the output loudness envelope with the input loudness envelope; the closer to 1, the more the output envelope is used')
+ parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False,
+                     help='Whether to use feature retrieval; if enabled, the clustering model is disabled, and the cm and cr arguments become the feature-retrieval index path and mix ratio')
+
+ # Shallow diffusion settings
+ parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt",
+                     help='Diffusion model path')
+ parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml",
+                     help='Diffusion model config file path')
+ parser.add_argument('-ks', '--k_step', type=int, default=100, help='Number of diffusion steps; larger values get closer to the pure diffusion result, default 100')
+ parser.add_argument('-se', '--second_encoding', action='store_true', default=False,
+                     help='Second encoding: re-encode the raw audio before shallow diffusion; an arcane option that sometimes helps and sometimes hurts')
+ parser.add_argument('-od', '--only_diffusion', action='store_true', default=False,
+                     help='Pure diffusion mode: does not load the sovits model and runs inference with the diffusion model only')
 
  # Settings that normally need no change
+ parser.add_argument('-sd', '--slice_db', type=int, default=-40,
+                     help='Default -40; use -30 for noisy audio, -50 for dry vocals that keep breathing')
  parser.add_argument('-d', '--device', type=str, default=None, help='Inference device; None selects cpu or gpu automatically')
  parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='Noise level; affects articulation and audio quality, somewhat arcane')
+ parser.add_argument('-p', '--pad_seconds', type=float, default=0.5,
+                     help='Seconds of padding for the inference audio; for unknown reasons there are artifacts at the start and end, which disappear after padding with a short silence')
  parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='Audio output format')
+ parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75,
+                     help='After automatic slicing, the head and tail of each slice are discarded. This sets the proportion of the crossfade length to keep, range 0-1, open on the left and closed on the right')
+ parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0,
+                     help='Adapt the enhancer to a higher vocal range (in semitones) | default 0')
+ parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,
+                     help='F0 filter threshold, only effective when using crepe. Range 0-1. Lowering it reduces the chance of going off-key but increases muted notes')
78
+
79
+ def preprocess_args(args1):
80
+ spk1 = args1.spk_list[0]
81
+ args1.model_path += f"{spk1}.pth"
82
+ args1.config_path += f"config_{spk1}.json"
83
+ args1.clip = 30
84
+
85
+ if spk1 == 'tomori':
86
+ args1.feature_retrieval = True
87
+ args1.cluster_model_path = "logs/44k/tomori_index.pkl"
88
+ args1.cluster_infer_ratio = 0.5
89
+ args1.f0_predictor = 'crepe'
90
+
91
+ return args1
92
 
93
  args = parser.parse_args()
94
+ args = preprocess_args(args)
95
 
 
 
96
  clean_names = args.clean_names
97
  trans = args.trans
98
  spk_list = args.spk_list
 
105
  clip = args.clip
106
  lg = args.linear_gradient
107
  lgr = args.linear_gradient_retain
108
+ f0p = args.f0_predictor
109
+ enhance = args.enhance
110
+ enhancer_adaptive_key = args.enhancer_adaptive_key
111
+ cr_threshold = args.f0_filter_threshold
112
+ diffusion_model_path = args.diffusion_model_path
113
+ diffusion_config_path = args.diffusion_config_path
114
+ k_step = args.k_step
115
+ only_diffusion = args.only_diffusion
116
+ shallow_diffusion = args.shallow_diffusion
117
+ use_spk_mix = args.use_spk_mix
118
+ second_encoding = args.second_encoding
119
+ loudness_envelope_adjustment = args.loudness_envelope_adjustment
120
+
121
+ svc_model = Svc(args.model_path,
122
+ args.config_path,
123
+ args.device,
124
+ args.cluster_model_path,
125
+ enhance,
126
+ diffusion_model_path,
127
+ diffusion_config_path,
128
+ shallow_diffusion,
129
+ only_diffusion,
130
+ use_spk_mix,
131
+ args.feature_retrieval)
132
+
133
+ infer_tool.mkdir(["raw", "results"])
134
+
135
+ if len(spk_mix_map) <= 1:
136
+ use_spk_mix = False
137
+ if use_spk_mix:
138
+ spk_list = [spk_mix_map]
139
 
140
  infer_tool.fill_a_to_b(trans, clean_names)
141
  for clean_name, tran in zip(clean_names, trans):
 
143
  if "." not in raw_audio_path:
144
  raw_audio_path += ".wav"
145
  infer_tool.format_wav(raw_audio_path)
 
 
 
 
 
 
 
 
 
 
146
  for spk in spk_list:
147
+ kwarg = {
148
+ "raw_audio_path": raw_audio_path,
149
+ "spk": spk,
150
+ "tran": tran,
151
+ "slice_db": slice_db,
152
+ "cluster_infer_ratio": cluster_infer_ratio,
153
+ "auto_predict_f0": auto_predict_f0,
154
+ "noice_scale": noice_scale,
155
+ "pad_seconds": pad_seconds,
156
+ "clip_seconds": clip,
157
+ "lg_num": lg,
158
+ "lgr_num": lgr,
159
+ "f0_predictor": f0p,
160
+ "enhancer_adaptive_key": enhancer_adaptive_key,
161
+ "cr_threshold": cr_threshold,
162
+ "k_step": k_step,
163
+ "use_spk_mix": use_spk_mix,
164
+ "second_encoding": second_encoding,
165
+ "loudness_envelope_adjustment": loudness_envelope_adjustment
166
+ }
167
+ audio = svc_model.slice_inference(**kwarg)
 
168
  key = "auto" if auto_predict_f0 else f"{tran}key"
169
  cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
170
+ isdiffusion = "sovits"
171
+ if shallow_diffusion: isdiffusion = "sovdiff"
172
+ if only_diffusion: isdiffusion = "diff"
173
+ if use_spk_mix:
174
+ spk = "spk_mix"
175
+ res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
176
  soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
177
+ svc_model.clear_empty()
178
+
179
 
180
  if __name__ == '__main__':
181
  main()
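
For reference, the rewritten main() above now delegates slicing and crossfading to Svc.slice_inference. A hedged programmatic sketch of the same call, with argument names taken from the diff and values from the argparse defaults (paths and clip follow preprocess_args for the default speaker 'buyizi'; the wav path is hypothetical, and the exact Svc / slice_inference signatures are assumed to match inference.infer_tool):

from inference.infer_tool import Svc

svc_model = Svc("logs/44k/buyizi.pth",           # model_path after preprocess_args
                "configs/config_buyizi.json",    # config_path after preprocess_args
                None,                            # device: auto-select cpu/gpu
                "logs/44k/kmeans_10000.pt",      # cluster model or feature-retrieval index
                False,                           # enhance
                "logs/44k/diffusion/model_0.pt", # diffusion_model_path
                "logs/44k/diffusion/config.yaml",# diffusion_config_path
                False,                           # shallow_diffusion
                False,                           # only_diffusion
                False,                           # use_spk_mix
                False)                           # feature_retrieval

audio = svc_model.slice_inference(
    raw_audio_path="raw/test.wav", spk="buyizi", tran=0, slice_db=-40,
    cluster_infer_ratio=0, auto_predict_f0=False, noice_scale=0.4,
    pad_seconds=0.5, clip_seconds=30, lg_num=0, lgr_num=0.75,
    f0_predictor="harvest", enhancer_adaptive_key=0, cr_threshold=0.05,
    k_step=100, use_spk_mix=False, second_encoding=False,
    loudness_envelope_adjustment=1)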
models.py CHANGED
@@ -13,111 +13,111 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
 
14
  import utils
15
  from modules.commons import init_weights, get_padding
16
- from vdecoder.hifigan.models import Generator
17
  from utils import f0_to_coarse
18
 
19
  class ResidualCouplingBlock(nn.Module):
20
- def __init__(self,
21
- channels,
22
- hidden_channels,
23
- kernel_size,
24
- dilation_rate,
25
- n_layers,
26
- n_flows=4,
27
- gin_channels=0):
28
- super().__init__()
29
- self.channels = channels
30
- self.hidden_channels = hidden_channels
31
- self.kernel_size = kernel_size
32
- self.dilation_rate = dilation_rate
33
- self.n_layers = n_layers
34
- self.n_flows = n_flows
35
- self.gin_channels = gin_channels
36
-
37
- self.flows = nn.ModuleList()
38
- for i in range(n_flows):
39
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
40
- self.flows.append(modules.Flip())
41
-
42
- def forward(self, x, x_mask, g=None, reverse=False):
43
- if not reverse:
44
- for flow in self.flows:
45
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
46
- else:
47
- for flow in reversed(self.flows):
48
- x = flow(x, x_mask, g=g, reverse=reverse)
49
- return x
 
 
50
 
51
 
52
  class Encoder(nn.Module):
53
- def __init__(self,
54
- in_channels,
55
- out_channels,
56
- hidden_channels,
57
- kernel_size,
58
- dilation_rate,
59
- n_layers,
60
- gin_channels=0):
61
- super().__init__()
62
- self.in_channels = in_channels
63
- self.out_channels = out_channels
64
- self.hidden_channels = hidden_channels
65
- self.kernel_size = kernel_size
66
- self.dilation_rate = dilation_rate
67
- self.n_layers = n_layers
68
- self.gin_channels = gin_channels
69
-
70
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
71
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
72
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
73
-
74
- def forward(self, x, x_lengths, g=None):
75
- # print(x.shape,x_lengths.shape)
76
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
77
- x = self.pre(x) * x_mask
78
- x = self.enc(x, x_mask, g=g)
79
- stats = self.proj(x) * x_mask
80
- m, logs = torch.split(stats, self.out_channels, dim=1)
81
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
82
- return z, m, logs, x_mask
83
 
84
 
85
  class TextEncoder(nn.Module):
86
- def __init__(self,
87
- out_channels,
88
- hidden_channels,
89
- kernel_size,
90
- n_layers,
91
- gin_channels=0,
92
- filter_channels=None,
93
- n_heads=None,
94
- p_dropout=None):
95
- super().__init__()
96
- self.out_channels = out_channels
97
- self.hidden_channels = hidden_channels
98
- self.kernel_size = kernel_size
99
- self.n_layers = n_layers
100
- self.gin_channels = gin_channels
101
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
102
- self.f0_emb = nn.Embedding(256, hidden_channels)
103
-
104
- self.enc_ = attentions.Encoder(
105
- hidden_channels,
106
- filter_channels,
107
- n_heads,
108
- n_layers,
109
- kernel_size,
110
- p_dropout)
111
-
112
- def forward(self, x, x_mask, f0=None, noice_scale=1):
113
- x = x + self.f0_emb(f0).transpose(1,2)
114
- x = self.enc_(x * x_mask, x_mask)
115
- stats = self.proj(x) * x_mask
116
- m, logs = torch.split(stats, self.out_channels, dim=1)
117
- z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
118
-
119
- return z, m, logs, x_mask
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
 
123
  class DiscriminatorP(torch.nn.Module):
@@ -140,7 +140,7 @@ class DiscriminatorP(torch.nn.Module):
140
 
141
  # 1d to 2d
142
  b, c, t = x.shape
143
- if t % self.period != 0: # pad first
144
  n_pad = self.period - (t % self.period)
145
  x = F.pad(x, (0, n_pad), "reflect")
146
  t = t + n_pad
@@ -188,7 +188,7 @@ class DiscriminatorS(torch.nn.Module):
188
  class MultiPeriodDiscriminator(torch.nn.Module):
189
  def __init__(self, use_spectral_norm=False):
190
  super(MultiPeriodDiscriminator, self).__init__()
191
- periods = [2,3,5,7,11]
192
 
193
  discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
194
  discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
@@ -225,26 +225,26 @@ class SpeakerEncoder(torch.nn.Module):
225
 
226
  def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
227
  mel_slices = []
228
- for i in range(0, total_frames-partial_frames, partial_hop):
229
- mel_range = torch.arange(i, i+partial_frames)
230
  mel_slices.append(mel_range)
231
 
232
  return mel_slices
233
 
234
  def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
235
  mel_len = mel.size(1)
236
- last_mel = mel[:,-partial_frames:]
237
 
238
  if mel_len > partial_frames:
239
  mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
240
- mels = list(mel[:,s] for s in mel_slices)
241
  mels.append(last_mel)
242
  mels = torch.stack(tuple(mels), 0).squeeze(1)
243
 
244
  with torch.no_grad():
245
  partial_embeds = self(mels)
246
  embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
247
- #embed = embed / torch.linalg.norm(embed, 2)
248
  else:
249
  with torch.no_grad():
250
  embed = self(last_mel)
@@ -280,7 +280,7 @@ class F0Decoder(nn.Module):
280
  kernel_size,
281
  p_dropout)
282
  self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
283
- self.f0_prenet = nn.Conv1d(1, hidden_channels , 3, padding=1)
284
  self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
285
 
286
  def forward(self, x, norm_f0, x_mask, spk_emb=None):
@@ -295,126 +295,175 @@ class F0Decoder(nn.Module):
295
 
296
 
297
  class SynthesizerTrn(nn.Module):
298
- """
299
- Synthesizer for Training
300
- """
301
-
302
- def __init__(self,
303
- spec_channels,
304
- segment_size,
305
- inter_channels,
306
- hidden_channels,
307
- filter_channels,
308
- n_heads,
309
- n_layers,
310
- kernel_size,
311
- p_dropout,
312
- resblock,
313
- resblock_kernel_sizes,
314
- resblock_dilation_sizes,
315
- upsample_rates,
316
- upsample_initial_channel,
317
- upsample_kernel_sizes,
318
- gin_channels,
319
- ssl_dim,
320
- n_speakers,
321
- sampling_rate=44100,
322
- **kwargs):
323
-
324
- super().__init__()
325
- self.spec_channels = spec_channels
326
- self.inter_channels = inter_channels
327
- self.hidden_channels = hidden_channels
328
- self.filter_channels = filter_channels
329
- self.n_heads = n_heads
330
- self.n_layers = n_layers
331
- self.kernel_size = kernel_size
332
- self.p_dropout = p_dropout
333
- self.resblock = resblock
334
- self.resblock_kernel_sizes = resblock_kernel_sizes
335
- self.resblock_dilation_sizes = resblock_dilation_sizes
336
- self.upsample_rates = upsample_rates
337
- self.upsample_initial_channel = upsample_initial_channel
338
- self.upsample_kernel_sizes = upsample_kernel_sizes
339
- self.segment_size = segment_size
340
- self.gin_channels = gin_channels
341
- self.ssl_dim = ssl_dim
342
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
343
-
344
- self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
345
-
346
- self.enc_p = TextEncoder(
347
- inter_channels,
348
- hidden_channels,
349
- filter_channels=filter_channels,
350
- n_heads=n_heads,
351
- n_layers=n_layers,
352
- kernel_size=kernel_size,
353
- p_dropout=p_dropout
354
- )
355
- hps = {
356
- "sampling_rate": sampling_rate,
357
- "inter_channels": inter_channels,
358
- "resblock": resblock,
359
- "resblock_kernel_sizes": resblock_kernel_sizes,
360
- "resblock_dilation_sizes": resblock_dilation_sizes,
361
- "upsample_rates": upsample_rates,
362
- "upsample_initial_channel": upsample_initial_channel,
363
- "upsample_kernel_sizes": upsample_kernel_sizes,
364
- "gin_channels": gin_channels,
365
- }
366
- self.dec = Generator(h=hps)
367
- self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
368
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
369
- self.f0_decoder = F0Decoder(
370
- 1,
371
- hidden_channels,
372
- filter_channels,
373
- n_heads,
374
- n_layers,
375
- kernel_size,
376
- p_dropout,
377
- spk_channels=gin_channels
378
- )
379
- self.emb_uv = nn.Embedding(2, hidden_channels)
380
-
381
- def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
382
- g = self.emb_g(g).transpose(1,2)
383
- # ssl prenet
384
- x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
385
- x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2)
386
-
387
- # f0 predict
388
- lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
389
- norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
390
- pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
391
-
392
- # encoder
393
- z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
394
- z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
395
-
396
- # flow
397
- z_p = self.flow(z, spec_mask, g=g)
398
- z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
399
-
400
- # nsf decoder
401
- o = self.dec(z_slice, g=g, f0=pitch_slice)
402
-
403
- return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
404
-
405
- def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
406
- c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
407
- g = self.emb_g(g).transpose(1,2)
408
- x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
409
- x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2)
410
-
411
- if predict_f0:
 
 
 
 
 
412
  lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
413
- norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
414
  pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
415
- f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
416
 
417
- z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
418
- z = self.flow(z_p, c_mask, g=g, reverse=True)
419
- o = self.dec(z * c_mask, g=g, f0=f0)
420
- return o
 
 
13
 
14
  import utils
15
  from modules.commons import init_weights, get_padding
 
16
  from utils import f0_to_coarse
17
 
18
  class ResidualCouplingBlock(nn.Module):
19
+ def __init__(self,
20
+ channels,
21
+ hidden_channels,
22
+ kernel_size,
23
+ dilation_rate,
24
+ n_layers,
25
+ n_flows=4,
26
+ gin_channels=0):
27
+ super().__init__()
28
+ self.channels = channels
29
+ self.hidden_channels = hidden_channels
30
+ self.kernel_size = kernel_size
31
+ self.dilation_rate = dilation_rate
32
+ self.n_layers = n_layers
33
+ self.n_flows = n_flows
34
+ self.gin_channels = gin_channels
35
+
36
+ self.flows = nn.ModuleList()
37
+ for i in range(n_flows):
38
+ self.flows.append(
39
+ modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
40
+ gin_channels=gin_channels, mean_only=True))
41
+ self.flows.append(modules.Flip())
42
+
43
+ def forward(self, x, x_mask, g=None, reverse=False):
44
+ if not reverse:
45
+ for flow in self.flows:
46
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
47
+ else:
48
+ for flow in reversed(self.flows):
49
+ x = flow(x, x_mask, g=g, reverse=reverse)
50
+ return x
51
 
52
 
53
  class Encoder(nn.Module):
54
+ def __init__(self,
55
+ in_channels,
56
+ out_channels,
57
+ hidden_channels,
58
+ kernel_size,
59
+ dilation_rate,
60
+ n_layers,
61
+ gin_channels=0):
62
+ super().__init__()
63
+ self.in_channels = in_channels
64
+ self.out_channels = out_channels
65
+ self.hidden_channels = hidden_channels
66
+ self.kernel_size = kernel_size
67
+ self.dilation_rate = dilation_rate
68
+ self.n_layers = n_layers
69
+ self.gin_channels = gin_channels
70
+
71
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
72
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
73
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
74
+
75
+ def forward(self, x, x_lengths, g=None):
76
+ # print(x.shape,x_lengths.shape)
77
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
78
+ x = self.pre(x) * x_mask
79
+ x = self.enc(x, x_mask, g=g)
80
+ stats = self.proj(x) * x_mask
81
+ m, logs = torch.split(stats, self.out_channels, dim=1)
82
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
83
+ return z, m, logs, x_mask
84
 
85
 
86
  class TextEncoder(nn.Module):
87
+ def __init__(self,
88
+ out_channels,
89
+ hidden_channels,
90
+ kernel_size,
91
+ n_layers,
92
+ gin_channels=0,
93
+ filter_channels=None,
94
+ n_heads=None,
95
+ p_dropout=None):
96
+ super().__init__()
97
+ self.out_channels = out_channels
98
+ self.hidden_channels = hidden_channels
99
+ self.kernel_size = kernel_size
100
+ self.n_layers = n_layers
101
+ self.gin_channels = gin_channels
102
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
103
+ self.f0_emb = nn.Embedding(256, hidden_channels)
 
 
 
104
 
105
+ self.enc_ = attentions.Encoder(
106
+ hidden_channels,
107
+ filter_channels,
108
+ n_heads,
109
+ n_layers,
110
+ kernel_size,
111
+ p_dropout)
112
+
113
+ def forward(self, x, x_mask, f0=None, noice_scale=1):
114
+ x = x + self.f0_emb(f0).transpose(1, 2)
115
+ x = self.enc_(x * x_mask, x_mask)
116
+ stats = self.proj(x) * x_mask
117
+ m, logs = torch.split(stats, self.out_channels, dim=1)
118
+ z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
119
+
120
+ return z, m, logs, x_mask
121
 
122
 
123
  class DiscriminatorP(torch.nn.Module):
 
140
 
141
  # 1d to 2d
142
  b, c, t = x.shape
143
+ if t % self.period != 0: # pad first
144
  n_pad = self.period - (t % self.period)
145
  x = F.pad(x, (0, n_pad), "reflect")
146
  t = t + n_pad
 
188
  class MultiPeriodDiscriminator(torch.nn.Module):
189
  def __init__(self, use_spectral_norm=False):
190
  super(MultiPeriodDiscriminator, self).__init__()
191
+ periods = [2, 3, 5, 7, 11]
192
 
193
  discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
194
  discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
 
225
 
226
  def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
227
  mel_slices = []
228
+ for i in range(0, total_frames - partial_frames, partial_hop):
229
+ mel_range = torch.arange(i, i + partial_frames)
230
  mel_slices.append(mel_range)
231
 
232
  return mel_slices
233
 
234
  def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
235
  mel_len = mel.size(1)
236
+ last_mel = mel[:, -partial_frames:]
237
 
238
  if mel_len > partial_frames:
239
  mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
240
+ mels = list(mel[:, s] for s in mel_slices)
241
  mels.append(last_mel)
242
  mels = torch.stack(tuple(mels), 0).squeeze(1)
243
 
244
  with torch.no_grad():
245
  partial_embeds = self(mels)
246
  embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
247
+ # embed = embed / torch.linalg.norm(embed, 2)
248
  else:
249
  with torch.no_grad():
250
  embed = self(last_mel)
 
280
  kernel_size,
281
  p_dropout)
282
  self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
283
+ self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
284
  self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
285
 
286
  def forward(self, x, norm_f0, x_mask, spk_emb=None):
 
295
 
296
 
297
  class SynthesizerTrn(nn.Module):
298
+ """
299
+ Synthesizer for Training
300
+ """
301
+
302
+ def __init__(self,
303
+ spec_channels,
304
+ segment_size,
305
+ inter_channels,
306
+ hidden_channels,
307
+ filter_channels,
308
+ n_heads,
309
+ n_layers,
310
+ kernel_size,
311
+ p_dropout,
312
+ resblock,
313
+ resblock_kernel_sizes,
314
+ resblock_dilation_sizes,
315
+ upsample_rates,
316
+ upsample_initial_channel,
317
+ upsample_kernel_sizes,
318
+ gin_channels,
319
+ ssl_dim,
320
+ n_speakers,
321
+ sampling_rate=44100,
322
+ vol_embedding=False,
323
+ vocoder_name = "nsf-hifigan",
324
+ **kwargs):
325
+
326
+ super().__init__()
327
+ self.spec_channels = spec_channels
328
+ self.inter_channels = inter_channels
329
+ self.hidden_channels = hidden_channels
330
+ self.filter_channels = filter_channels
331
+ self.n_heads = n_heads
332
+ self.n_layers = n_layers
333
+ self.kernel_size = kernel_size
334
+ self.p_dropout = p_dropout
335
+ self.resblock = resblock
336
+ self.resblock_kernel_sizes = resblock_kernel_sizes
337
+ self.resblock_dilation_sizes = resblock_dilation_sizes
338
+ self.upsample_rates = upsample_rates
339
+ self.upsample_initial_channel = upsample_initial_channel
340
+ self.upsample_kernel_sizes = upsample_kernel_sizes
341
+ self.segment_size = segment_size
342
+ self.gin_channels = gin_channels
343
+ self.ssl_dim = ssl_dim
344
+ self.vol_embedding = vol_embedding
345
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
346
+ if vol_embedding:
347
+ self.emb_vol = nn.Linear(1, hidden_channels)
348
+
349
+ self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
350
+
351
+ self.enc_p = TextEncoder(
352
+ inter_channels,
353
+ hidden_channels,
354
+ filter_channels=filter_channels,
355
+ n_heads=n_heads,
356
+ n_layers=n_layers,
357
+ kernel_size=kernel_size,
358
+ p_dropout=p_dropout
359
+ )
360
+ hps = {
361
+ "sampling_rate": sampling_rate,
362
+ "inter_channels": inter_channels,
363
+ "resblock": resblock,
364
+ "resblock_kernel_sizes": resblock_kernel_sizes,
365
+ "resblock_dilation_sizes": resblock_dilation_sizes,
366
+ "upsample_rates": upsample_rates,
367
+ "upsample_initial_channel": upsample_initial_channel,
368
+ "upsample_kernel_sizes": upsample_kernel_sizes,
369
+ "gin_channels": gin_channels,
370
+ }
371
+
372
+
373
+ if vocoder_name == "nsf-hifigan":
374
+ from vdecoder.hifigan.models import Generator
375
+ self.dec = Generator(h=hps)
376
+ elif vocoder_name == "nsf-snake-hifigan":
377
+ from vdecoder.hifiganwithsnake.models import Generator
378
+ self.dec = Generator(h=hps)
379
+ else:
380
+ print("[?] Unkown vocoder: use default(nsf-hifigan)")
381
+ from vdecoder.hifigan.models import Generator
382
+ self.dec = Generator(h=hps)
383
+
384
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
385
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
386
+ self.f0_decoder = F0Decoder(
387
+ 1,
388
+ hidden_channels,
389
+ filter_channels,
390
+ n_heads,
391
+ n_layers,
392
+ kernel_size,
393
+ p_dropout,
394
+ spk_channels=gin_channels
395
+ )
396
+ self.emb_uv = nn.Embedding(2, hidden_channels)
397
+ self.character_mix = False
398
+
399
+ def EnableCharacterMix(self, n_speakers_map, device):
400
+ self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels)).to(device)
401
+ for i in range(n_speakers_map):
402
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]).to(device))
403
+ self.speaker_map = self.speaker_map.unsqueeze(0).to(device)
404
+ self.character_mix = True
405
+
406
+ def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None, vol = None):
407
+ g = self.emb_g(g).transpose(1,2)
408
+
409
+ # vol proj
410
+ vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
411
+
412
+ # ssl prenet
413
+ x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
414
+ x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
415
+
416
+ # f0 predict
417
  lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
418
+ norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
419
  pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
 
420
 
421
+ # encoder
422
+ z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
423
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
424
+
425
+ # flow
426
+ z_p = self.flow(z, spec_mask, g=g)
427
+ z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
428
+
429
+ # nsf decoder
430
+ o = self.dec(z_slice, g=g, f0=pitch_slice)
431
+
432
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
433
+
434
+ def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None):
435
+
436
+ if c.device == torch.device("cuda"):
437
+ torch.cuda.manual_seed_all(seed)
438
+ else:
439
+ torch.manual_seed(seed)
440
+
441
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
442
+
443
+ if self.character_mix and len(g) > 1: # [N, S] * [S, B, 1, H]
444
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
445
+ g = g * self.speaker_map # [N, S, B, 1, H]
446
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
447
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
448
+ else:
449
+ if g.dim() == 1:
450
+ g = g.unsqueeze(0)
451
+ g = self.emb_g(g).transpose(1, 2)
452
+
453
+ x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
454
+ # vol proj
455
+ vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
456
+
457
+ x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
458
+
459
+ if predict_f0:
460
+ lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
461
+ norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
462
+ pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
463
+ f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
464
+
465
+ z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
466
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
467
+ o = self.dec(z * c_mask, g=g, f0=f0)
468
+ return o,f0
469
+
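
The F0 decoder in SynthesizerTrn works on a scaled mel-style log-F0: forward() maps f0 to lf0 with 2595 * log10(1 + f0/700) / 500, and infer() converts the prediction back with 700 * (10 ** (lf0 * 500 / 2595) - 1). A quick numerical check that these two transforms are inverses (illustrative values only):

import torch

f0 = torch.tensor([110.0, 220.0, 440.0])                 # example pitches in Hz
lf0 = 2595. * torch.log10(1. + f0 / 700.) / 500           # scaled log-F0 fed to the F0 decoder
f0_back = 700 * (torch.pow(10, lf0 * 500 / 2595) - 1)     # inversion used in infer()
print(torch.allclose(f0, f0_back, atol=1e-3))             # True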
utils.py CHANGED
@@ -6,18 +6,21 @@ import argparse
6
  import logging
7
  import json
8
  import subprocess
 
9
  import random
10
-
11
  import librosa
12
  import numpy as np
13
  from scipy.io.wavfile import read
14
  import torch
15
  from torch.nn import functional as F
16
  from modules.commons import sequence_mask
17
- from hubert import hubert_model
 
 
18
  MATPLOTLIB_FLAG = False
19
 
20
- logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
21
  logger = logging
22
 
23
  f0_bin = 256
@@ -26,26 +29,6 @@ f0_min = 50.0
26
  f0_mel_min = 1127 * np.log(1 + f0_min / 700)
27
  f0_mel_max = 1127 * np.log(1 + f0_max / 700)
28
 
29
-
30
- # def normalize_f0(f0, random_scale=True):
31
- # f0_norm = f0.clone() # create a copy of the input Tensor
32
- # batch_size, _, frame_length = f0_norm.shape
33
- # for i in range(batch_size):
34
- # means = torch.mean(f0_norm[i, 0, :])
35
- # if random_scale:
36
- # factor = random.uniform(0.8, 1.2)
37
- # else:
38
- # factor = 1
39
- # f0_norm[i, 0, :] = (f0_norm[i, 0, :] - means) * factor
40
- # return f0_norm
41
- # def normalize_f0(f0, random_scale=True):
42
- # means = torch.mean(f0[:, 0, :], dim=1, keepdim=True)
43
- # if random_scale:
44
- # factor = torch.Tensor(f0.shape[0],1).uniform_(0.8, 1.2).to(f0.device)
45
- # else:
46
- # factor = torch.ones(f0.shape[0], 1, 1).to(f0.device)
47
- # f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
48
- # return f0_norm
49
  def normalize_f0(f0, x_mask, uv, random_scale=True):
50
  # calculate means based on x_mask
51
  uv_sum = torch.sum(uv, dim=1, keepdim=True)
@@ -62,7 +45,6 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
62
  exit(0)
63
  return f0_norm * x_mask
64
 
65
-
66
  def plot_data_to_numpy(x, y):
67
  global MATPLOTLIB_FLAG
68
  if not MATPLOTLIB_FLAG:
@@ -86,87 +68,6 @@ def plot_data_to_numpy(x, y):
86
  return data
87
 
88
 
89
-
90
- def interpolate_f0(f0):
91
- '''
92
- 对F0进行插值处理
93
- '''
94
-
95
- data = np.reshape(f0, (f0.size, 1))
96
-
97
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
98
- vuv_vector[data > 0.0] = 1.0
99
- vuv_vector[data <= 0.0] = 0.0
100
-
101
- ip_data = data
102
-
103
- frame_number = data.size
104
- last_value = 0.0
105
- for i in range(frame_number):
106
- if data[i] <= 0.0:
107
- j = i + 1
108
- for j in range(i + 1, frame_number):
109
- if data[j] > 0.0:
110
- break
111
- if j < frame_number - 1:
112
- if last_value > 0.0:
113
- step = (data[j] - data[i - 1]) / float(j - i)
114
- for k in range(i, j):
115
- ip_data[k] = data[i - 1] + step * (k - i + 1)
116
- else:
117
- for k in range(i, j):
118
- ip_data[k] = data[j]
119
- else:
120
- for k in range(i, frame_number):
121
- ip_data[k] = last_value
122
- else:
123
- ip_data[i] = data[i]
124
- last_value = data[i]
125
-
126
- return ip_data[:,0], vuv_vector[:,0]
127
-
128
-
129
- def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
130
- import parselmouth
131
- x = wav_numpy
132
- if p_len is None:
133
- p_len = x.shape[0]//hop_length
134
- else:
135
- assert abs(p_len-x.shape[0]//hop_length) < 4, "pad length error"
136
- time_step = hop_length / sampling_rate * 1000
137
- f0_min = 50
138
- f0_max = 1100
139
- f0 = parselmouth.Sound(x, sampling_rate).to_pitch_ac(
140
- time_step=time_step / 1000, voicing_threshold=0.6,
141
- pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
142
-
143
- pad_size=(p_len - len(f0) + 1) // 2
144
- if(pad_size>0 or p_len - len(f0) - pad_size>0):
145
- f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
146
- return f0
147
-
148
- def resize_f0(x, target_len):
149
- source = np.array(x)
150
- source[source<0.001] = np.nan
151
- target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
152
- res = np.nan_to_num(target)
153
- return res
154
-
155
- def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
156
- import pyworld
157
- if p_len is None:
158
- p_len = wav_numpy.shape[0]//hop_length
159
- f0, t = pyworld.dio(
160
- wav_numpy.astype(np.double),
161
- fs=sampling_rate,
162
- f0_ceil=800,
163
- frame_period=1000 * hop_length / sampling_rate,
164
- )
165
- f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
166
- for index, pitch in enumerate(f0):
167
- f0[index] = round(pitch, 1)
168
- return resize_f0(f0, p_len)
169
-
170
  def f0_to_coarse(f0):
171
  is_torch = isinstance(f0, torch.Tensor)
172
  f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
@@ -174,48 +75,73 @@ def f0_to_coarse(f0):
174
 
175
  f0_mel[f0_mel <= 1] = 1
176
  f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
177
- f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
178
  assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
179
  return f0_coarse
180
 
181
-
182
- def get_hubert_model():
183
- vec_path = "hubert/checkpoint_best_legacy_500.pt"
184
- print("load model(s) from {}".format(vec_path))
185
- from fairseq import checkpoint_utils
186
- models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
187
- [vec_path],
188
- suffix="",
189
- )
190
- model = models[0]
191
- model.eval()
192
- return model
193
-
194
- def get_hubert_content(hmodel, wav_16k_tensor):
195
- feats = wav_16k_tensor
196
- if feats.dim() == 2: # double channels
197
- feats = feats.mean(-1)
198
- assert feats.dim() == 1, feats.dim()
199
- feats = feats.view(1, -1)
200
- padding_mask = torch.BoolTensor(feats.shape).fill_(False)
201
- inputs = {
202
- "source": feats.to(wav_16k_tensor.device),
203
- "padding_mask": padding_mask.to(wav_16k_tensor.device),
204
- "output_layer": 9, # layer 9
205
- }
206
- with torch.no_grad():
207
- logits = hmodel.extract_features(**inputs)
208
- feats = hmodel.final_proj(logits[0])
209
- return feats.transpose(1, 2)
210
-
211
-
212
  def get_content(cmodel, y):
213
  with torch.no_grad():
214
  c = cmodel.extract_features(y.squeeze(1))[0]
215
  c = c.transpose(1, 2)
216
  return c
217
 
218
-
 
 
219
 
220
  def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
221
  assert os.path.isfile(checkpoint_path)
@@ -244,6 +170,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
244
  model.module.load_state_dict(new_state_dict)
245
  else:
246
  model.load_state_dict(new_state_dict)
 
247
  logger.info("Loaded checkpoint '{}' (iteration {})".format(
248
  checkpoint_path, iteration))
249
  return model, optimizer, learning_rate, iteration
@@ -368,7 +295,7 @@ def load_filepaths_and_text(filename, split="|"):
368
 
369
  def get_hparams(init=True):
370
  parser = argparse.ArgumentParser()
371
- parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
372
  help='JSON file for configuration')
373
  parser.add_argument('-m', '--model', type=str, required=True,
374
  help='Model name')
@@ -411,7 +338,6 @@ def get_hparams_from_file(config_path):
411
  with open(config_path, "r") as f:
412
  data = f.read()
413
  config = json.loads(data)
414
-
415
  hparams =HParams(**config)
416
  return hparams
417
 
@@ -468,6 +394,73 @@ def repeat_expand_2d(content, target_len):
468
  return target
469
 
470
 
 
471
  class HParams():
472
  def __init__(self, **kwargs):
473
  for k, v in kwargs.items():
@@ -499,3 +492,19 @@ class HParams():
499
  def __repr__(self):
500
  return self.__dict__.__repr__()
501
 
 
 
6
  import logging
7
  import json
8
  import subprocess
9
+ import warnings
10
  import random
11
+ import functools
12
  import librosa
13
  import numpy as np
14
  from scipy.io.wavfile import read
15
  import torch
16
  from torch.nn import functional as F
17
  from modules.commons import sequence_mask
18
+ import faiss
19
+ import tqdm
20
+
21
  MATPLOTLIB_FLAG = False
22
 
23
+ logging.basicConfig(stream=sys.stdout, level=logging.WARN)
24
  logger = logging
25
 
26
  f0_bin = 256
 
29
  f0_mel_min = 1127 * np.log(1 + f0_min / 700)
30
  f0_mel_max = 1127 * np.log(1 + f0_max / 700)
31
 
 
 
32
  def normalize_f0(f0, x_mask, uv, random_scale=True):
33
  # calculate means based on x_mask
34
  uv_sum = torch.sum(uv, dim=1, keepdim=True)
 
45
  exit(0)
46
  return f0_norm * x_mask
47
 
 
48
  def plot_data_to_numpy(x, y):
49
  global MATPLOTLIB_FLAG
50
  if not MATPLOTLIB_FLAG:
 
68
  return data
69
 
70
 
 
 
71
  def f0_to_coarse(f0):
72
  is_torch = isinstance(f0, torch.Tensor)
73
  f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
 
75
 
76
  f0_mel[f0_mel <= 1] = 1
77
  f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
78
+ f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int64)
79
  assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
80
  return f0_coarse
81
 
 
 
82
  def get_content(cmodel, y):
83
  with torch.no_grad():
84
  c = cmodel.extract_features(y.squeeze(1))[0]
85
  c = c.transpose(1, 2)
86
  return c
87
 
88
+ def get_f0_predictor(f0_predictor,hop_length,sampling_rate,**kargs):
89
+ if f0_predictor == "pm":
90
+ from modules.F0Predictor.PMF0Predictor import PMF0Predictor
91
+ f0_predictor_object = PMF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
92
+ elif f0_predictor == "crepe":
93
+ from modules.F0Predictor.CrepeF0Predictor import CrepeF0Predictor
94
+ f0_predictor_object = CrepeF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,device=kargs["device"],threshold=kargs["threshold"])
95
+ elif f0_predictor == "harvest":
96
+ from modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
97
+ f0_predictor_object = HarvestF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
98
+ elif f0_predictor == "dio":
99
+ from modules.F0Predictor.DioF0Predictor import DioF0Predictor
100
+ f0_predictor_object = DioF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
101
+ else:
102
+ raise Exception("Unknown f0 predictor")
103
+ return f0_predictor_object
104
+
105
+ def get_speech_encoder(speech_encoder,device=None,**kargs):
106
+ if speech_encoder == "vec768l12":
107
+ from vencoder.ContentVec768L12 import ContentVec768L12
108
+ speech_encoder_object = ContentVec768L12(device = device)
109
+ elif speech_encoder == "vec256l9":
110
+ from vencoder.ContentVec256L9 import ContentVec256L9
111
+ speech_encoder_object = ContentVec256L9(device = device)
112
+ elif speech_encoder == "vec256l9-onnx":
113
+ from vencoder.ContentVec256L9_Onnx import ContentVec256L9_Onnx
114
+ speech_encoder_object = ContentVec256L9_Onnx(device = device)
115
+ elif speech_encoder == "vec256l12-onnx":
116
+ from vencoder.ContentVec256L12_Onnx import ContentVec256L12_Onnx
117
+ speech_encoder_object = ContentVec256L12_Onnx(device = device)
118
+ elif speech_encoder == "vec768l9-onnx":
119
+ from vencoder.ContentVec768L9_Onnx import ContentVec768L9_Onnx
120
+ speech_encoder_object = ContentVec768L9_Onnx(device = device)
121
+ elif speech_encoder == "vec768l12-onnx":
122
+ from vencoder.ContentVec768L12_Onnx import ContentVec768L12_Onnx
123
+ speech_encoder_object = ContentVec768L12_Onnx(device = device)
124
+ elif speech_encoder == "hubertsoft-onnx":
125
+ from vencoder.HubertSoft_Onnx import HubertSoft_Onnx
126
+ speech_encoder_object = HubertSoft_Onnx(device = device)
127
+ elif speech_encoder == "hubertsoft":
128
+ from vencoder.HubertSoft import HubertSoft
129
+ speech_encoder_object = HubertSoft(device = device)
130
+ elif speech_encoder == "whisper-ppg":
131
+ from vencoder.WhisperPPG import WhisperPPG
132
+ speech_encoder_object = WhisperPPG(device = device)
133
+ elif speech_encoder == "cnhubertlarge":
134
+ from vencoder.CNHubertLarge import CNHubertLarge
135
+ speech_encoder_object = CNHubertLarge(device = device)
136
+ elif speech_encoder == "dphubert":
137
+ from vencoder.DPHubert import DPHubert
138
+ speech_encoder_object = DPHubert(device = device)
139
+ elif speech_encoder == "whisper-ppg-large":
140
+ from vencoder.WhisperPPGLarge import WhisperPPGLarge
141
+ speech_encoder_object = WhisperPPGLarge(device = device)
142
+ else:
143
+ raise Exception("Unknown speech encoder")
144
+ return speech_encoder_object
145
 
146
  def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
147
  assert os.path.isfile(checkpoint_path)
 
170
  model.module.load_state_dict(new_state_dict)
171
  else:
172
  model.load_state_dict(new_state_dict)
173
+ print("load ")
174
  logger.info("Loaded checkpoint '{}' (iteration {})".format(
175
  checkpoint_path, iteration))
176
  return model, optimizer, learning_rate, iteration
 
295
 
296
  def get_hparams(init=True):
297
  parser = argparse.ArgumentParser()
298
+ parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
299
  help='JSON file for configuration')
300
  parser.add_argument('-m', '--model', type=str, required=True,
301
  help='Model name')
 
338
  with open(config_path, "r") as f:
339
  data = f.read()
340
  config = json.loads(data)
 
341
  hparams =HParams(**config)
342
  return hparams
343
 
 
394
  return target
395
 
396
 
397
+ def mix_model(model_paths,mix_rate,mode):
398
+ mix_rate = torch.FloatTensor(mix_rate)/100
399
+ model_tem = torch.load(model_paths[0])
400
+ models = [torch.load(path)["model"] for path in model_paths]
401
+ if mode == 0:
402
+ mix_rate = F.softmax(mix_rate,dim=0)
403
+ for k in model_tem["model"].keys():
404
+ model_tem["model"][k] = torch.zeros_like(model_tem["model"][k])
405
+ for i,model in enumerate(models):
406
+ model_tem["model"][k] += model[k]*mix_rate[i]
407
+ torch.save(model_tem,os.path.join(os.path.curdir,"output.pth"))
408
+ return os.path.join(os.path.curdir,"output.pth")
409
+
410
+ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output audio, rate is the proportion of 2; from RVC
411
+ # print(data1.max(),data2.max())
412
+ rms1 = librosa.feature.rms(
413
+ y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
414
+ ) # one point every half second
415
+ rms2 = librosa.feature.rms(y=data2.detach().cpu().numpy(), frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
416
+ rms1 = torch.from_numpy(rms1).to(data2.device)
417
+ rms1 = F.interpolate(
418
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
419
+ ).squeeze()
420
+ rms2 = torch.from_numpy(rms2).to(data2.device)
421
+ rms2 = F.interpolate(
422
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
423
+ ).squeeze()
424
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
425
+ data2 *= (
426
+ torch.pow(rms1, torch.tensor(1 - rate))
427
+ * torch.pow(rms2, torch.tensor(rate - 1))
428
+ )
429
+ return data2
430
+
431
+ def train_index(spk_name,root_dir = "dataset/44k/"): #from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
432
+ print("The feature index is constructing.")
433
+ exp_dir = os.path.join(root_dir,spk_name)
434
+ listdir_res = []
435
+ for file in os.listdir(exp_dir):
436
+ if ".wav.soft.pt" in file:
437
+ listdir_res.append(os.path.join(exp_dir,file))
438
+ if len(listdir_res) == 0:
439
+ raise Exception("You need to run preprocess_hubert_f0.py!")
440
+ npys = []
441
+ for name in sorted(listdir_res):
442
+ phone = torch.load(name)[0].transpose(-1,-2).numpy()
443
+ npys.append(phone)
444
+ big_npy = np.concatenate(npys, 0)
445
+ big_npy_idx = np.arange(big_npy.shape[0])
446
+ np.random.shuffle(big_npy_idx)
447
+ big_npy = big_npy[big_npy_idx]
448
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
449
+ index = faiss.index_factory(big_npy.shape[1] , "IVF%s,Flat" % n_ivf)
450
+ index_ivf = faiss.extract_index_ivf(index) #
451
+ index_ivf.nprobe = 1
452
+ index.train(big_npy)
453
+ batch_size_add = 8192
454
+ for i in range(0, big_npy.shape[0], batch_size_add):
455
+ index.add(big_npy[i : i + batch_size_add])
456
+ # faiss.write_index(
457
+ # index,
458
+ # f"added_{spk_name}.index"
459
+ # )
460
+ print("Successfully build index")
461
+ return index
462
+
463
+
464
  class HParams():
465
  def __init__(self, **kwargs):
466
  for k, v in kwargs.items():
 
492
  def __repr__(self):
493
  return self.__dict__.__repr__()
494
 
495
+ def get(self,index):
496
+ return self.__dict__.get(index)
497
+
498
+ class Volume_Extractor:
499
+ def __init__(self, hop_size = 512):
500
+ self.hop_size = hop_size
501
+
502
+ def extract(self, audio): # audio: 2d tensor array
503
+ if not isinstance(audio,torch.Tensor):
504
+ audio = torch.Tensor(audio)
505
+ n_frames = int(audio.size(-1) // self.hop_size)
506
+ audio2 = audio ** 2
507
+ audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
508
+ volume = torch.FloatTensor([torch.mean(audio2[:,int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
509
+ volume = torch.sqrt(volume)
510
+ return volume
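
The Volume_Extractor added above produces the frame-level loudness curve that data_utils.py expects to find at "<wav>.vol.npy". A hedged sketch of precomputing it for a single file (the wav path is hypothetical, and hop_size should match hparams.data.hop_length, assumed 512 here):

import librosa
import numpy as np
import torch
from utils import Volume_Extractor

wav_path = "dataset/44k/buyizi/0001.wav"        # hypothetical example file
audio, sr = librosa.load(wav_path, sr=44100)
audio = torch.FloatTensor(audio).unsqueeze(0)   # extract() expects a 2-D tensor (channels, samples)

volume_extractor = Volume_Extractor(hop_size=512)
volume = volume_extractor.extract(audio)        # one RMS value per hop
np.save(wav_path + ".vol.npy", volume.numpy())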