Commit f85ad87 · "4.1"
Committed by Katock
1 parent: 040c3ba

Files changed:
- data_utils.py      +44 -15
- inference_main.py   +121 -70
- models.py           +272 -223
- utils.py            +149 -140
data_utils.py CHANGED

@@ -7,7 +7,7 @@ import torch.utils.data
 
 import modules.commons as commons
 import utils
-from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
+from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 
 # import h5py

@@ -23,8 +23,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
     3) computes spectrograms from audio files.
     """
 
-    def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
+    def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
         self.audiopaths = load_filepaths_and_text(audiopaths)
+        self.hparams = hparams
         self.max_wav_value = hparams.data.max_wav_value
         self.sampling_rate = hparams.data.sampling_rate
         self.filter_length = hparams.data.filter_length

@@ -34,7 +35,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.use_sr = hparams.train.use_sr
         self.spec_len = hparams.train.max_speclen
         self.spk_map = hparams.spk
-
+        self.vol_emb = hparams.model.vol_embedding
+        self.vol_aug = hparams.train.vol_aug and vol_aug
         random.seed(1234)
         random.shuffle(self.audiopaths)
 
@@ -65,34 +67,55 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         spk = filename.split("/")[-2]
         spk = torch.LongTensor([self.spk_map[spk]])
 
-        f0 = np.load(filename + ".f0.npy")
-
-        f0 = torch.FloatTensor(f0)
-        uv = torch.FloatTensor(uv)
+        f0, uv = np.load(filename + ".f0.npy",allow_pickle=True)
+
+        f0 = torch.FloatTensor(np.array(f0,dtype=float))
+        uv = torch.FloatTensor(np.array(uv,dtype=float))
 
         c = torch.load(filename+ ".soft.pt")
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
-
+        if self.vol_emb:
+            volume_path = filename + ".vol.npy"
+            volume = np.load(volume_path)
+            volume = torch.from_numpy(volume).float()
+        else:
+            volume = None
 
         lmin = min(c.size(-1), spec.size(-1))
         assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
         assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
         spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
         audio_norm = audio_norm[:, :lmin * self.hop_length]
+        if volume!= None:
+            volume = volume[:lmin]
+        return c, f0, spec, audio_norm, spk, uv, volume
 
-
-
-    def random_slice(self, c, f0, spec, audio_norm, spk, uv):
+    def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
         # if spec.shape[1] < 30:
         #     print("skip too short audio:", filename)
         #     return None
+
+        if random.choice([True, False]) and self.vol_aug and volume!=None:
+            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
+            max_shift = min(1, np.log10(1/max_amp))
+            log10_vol_shift = random.uniform(-1, max_shift)
+            audio_norm = audio_norm * (10 ** log10_vol_shift)
+            volume = volume * (10 ** log10_vol_shift)
+            spec = spectrogram_torch(audio_norm,
+                                     self.hparams.data.filter_length,
+                                     self.hparams.data.sampling_rate,
+                                     self.hparams.data.hop_length,
+                                     self.hparams.data.win_length,
+                                     center=False)[0]
+
         if spec.shape[1] > 800:
             start = random.randint(0, spec.shape[1]-800)
            end = start + 790
            spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
            audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
-
-
+        if volume !=None:
+            volume = volume[start:end]
+        return c, f0, spec, audio_norm, spk, uv,volume
 
     def __getitem__(self, index):
         if self.all_in_mem:

@@ -124,12 +147,14 @@ class TextAudioCollate:
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
         spkids = torch.LongTensor(len(batch), 1)
         uv_padded = torch.FloatTensor(len(batch), max_c_len)
+        volume_padded = torch.FloatTensor(len(batch), max_c_len)
 
         c_padded.zero_()
         spec_padded.zero_()
         f0_padded.zero_()
         wav_padded.zero_()
         uv_padded.zero_()
+        volume_padded.zero_()
 
         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]

@@ -151,5 +176,9 @@ class TextAudioCollate:
 
             uv = row[5]
            uv_padded[i, :uv.size(0)] = uv
-
-
+            volume = row[6]
+            if volume != None:
+                volume_padded[i, :volume.size(0)] = volume
+            else :
+                volume_padded = None
+        return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
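The volume augmentation added to random_slice draws a gain in log10 space and caps the upper bound so the rescaled waveform cannot exceed full scale. A minimal sketch of that bound, written outside the dataset class (illustrative only, not part of the commit):

import random
import numpy as np

def random_gain(audio: np.ndarray) -> float:
    # same bound as in random_slice: a shift larger than log10(1/max_amp) would clip
    max_amp = float(np.max(np.abs(audio))) + 1e-5
    max_shift = min(1, np.log10(1 / max_amp))
    log10_vol_shift = random.uniform(-1, max_shift)   # gain between 0.1x and 10**max_shift
    return 10 ** log10_vol_shift

audio = np.random.uniform(-0.3, 0.3, 44100).astype(np.float32)
g = random_gain(audio)
assert np.max(np.abs(audio * g)) <= 1.0 + 1e-4       # stays inside [-1, 1]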
inference_main.py CHANGED

@@ -2,12 +2,11 @@ import io
 import logging
 import time
 from pathlib import Path
-
+from spkmix import spk_mix_map
 import librosa
 import matplotlib.pyplot as plt
 import numpy as np
 import soundfile
-
 from inference import infer_tool
 from inference import slicer
 from inference.infer_tool import Svc

@@ -16,39 +15,84 @@ logging.getLogger('numba').setLevel(logging.WARNING)
 chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
 
 
-
 def main():
     import argparse
 
     parser = argparse.ArgumentParser(description='sovits4 inference')
 
     # 一定要设置的部分
-    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/
-    parser.add_argument('-c', '--config_path', type=str, default="configs/
+    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/", help='模型路径')
+    parser.add_argument('-c', '--config_path', type=str, default="configs/", help='配置文件路径')
     parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片,默认0为自动切片,单位为秒/s')
-    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["
+    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"],
+                        help='wav文件名列表,放在raw文件夹下')
     parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
-    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['
+    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='合成目标说话人名称')
 
     # 可选项部分
-    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,
-
-    parser.add_argument('-
-
-    parser.add_argument('-
+    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,
+                        help='语音转换自动预测音高,转换歌声时不要打开这个会严重跑调')
+    parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt",
+                        help='聚类模型或特征检索索引路径,如果没有训练聚类或特征检索则随便填')
+    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0,
+                        help='聚类方案或特征检索占比,范围0-1,若没有训练聚类模型或特征检索则默认0即可')
+    parser.add_argument('-lg', '--linear_gradient', type=float, default=0,
+                        help='两段音频切片的交叉淡入长度,如果强制切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,单位为秒')
+    parser.add_argument('-f0p', '--f0_predictor', type=str, default="harvest",
+                        help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意:crepe为原F0使用均值滤波器)')
+    parser.add_argument('-eh', '--enhance', action='store_true', default=False,
+                        help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
+    parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False,
+                        help='是否使用浅层扩散,使用后可解决一部分电音问题,默认关闭,该选项打开时,NSF_HIFIGAN增强器将会被禁止')
+    parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='是否使用角色融合')
+    parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1,
+                        help='输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络')
+    parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False,
+                        help='是否使用特征检索,如果使用聚类模型将被禁用,且cm与cr参数将会变成特征检索的索引路径与混合比例')
+
+    # 浅扩散设置
+    parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt",
+                        help='扩散模型路径')
+    parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml",
+                        help='扩散模型配置文件路径')
+    parser.add_argument('-ks', '--k_step', type=int, default=100, help='扩散步数,越大越接近扩散模型的结果,默认100')
+    parser.add_argument('-se', '--second_encoding', action='store_true', default=False,
+                        help='二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,有时候效果好,有时候效果差')
+    parser.add_argument('-od', '--only_diffusion', action='store_true', default=False,
+                        help='纯扩散模式,该模式不会加载sovits模型,以扩散模型推理')
 
     # 不用动的部分
-    parser.add_argument('-sd', '--slice_db', type=int, default=-40,
+    parser.add_argument('-sd', '--slice_db', type=int, default=-40,
+                        help='默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50')
     parser.add_argument('-d', '--device', type=str, default=None, help='推理设备,None则为自动选择cpu和gpu')
     parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='噪音级别,会影响咬字和音质,较为玄学')
-    parser.add_argument('-p', '--pad_seconds', type=float, default=0.5,
+    parser.add_argument('-p', '--pad_seconds', type=float, default=0.5,
+                        help='推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现')
     parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音频输出格式')
-    parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75,
+    parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75,
+                        help='自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭')
+    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0,
+                        help='使增强器适应更高的音域(单位为半音数)|默认为0')
+    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,
+                        help='F0过滤阈值,只有使用crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音')
+
+    def preprocess_args(args1):
+        spk1 = args1.spk_list[0]
+        args1.model_path += f"{spk1}.pth"
+        args1.config_path += f"config_{spk1}.json"
+        args1.clip = 30
+
+        if spk1 == 'tomori':
+            args1.feature_retrieval = True
+            args1.cluster_model_path = "logs/44k/tomori_index.pkl"
+            args1.cluster_infer_ratio = 0.5
+            args1.f0_predictor = 'crepe'
+
+        return args1
 
     args = parser.parse_args()
+    args = preprocess_args(args)
 
-    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path)
-    infer_tool.mkdir(["raw", "results"])
     clean_names = args.clean_names
     trans = args.trans
     spk_list = args.spk_list

@@ -61,7 +105,37 @@ def main():
     clip = args.clip
     lg = args.linear_gradient
     lgr = args.linear_gradient_retain
-
+    f0p = args.f0_predictor
+    enhance = args.enhance
+    enhancer_adaptive_key = args.enhancer_adaptive_key
+    cr_threshold = args.f0_filter_threshold
+    diffusion_model_path = args.diffusion_model_path
+    diffusion_config_path = args.diffusion_config_path
+    k_step = args.k_step
+    only_diffusion = args.only_diffusion
+    shallow_diffusion = args.shallow_diffusion
+    use_spk_mix = args.use_spk_mix
+    second_encoding = args.second_encoding
+    loudness_envelope_adjustment = args.loudness_envelope_adjustment
+
+    svc_model = Svc(args.model_path,
+                    args.config_path,
+                    args.device,
+                    args.cluster_model_path,
+                    enhance,
+                    diffusion_model_path,
+                    diffusion_config_path,
+                    shallow_diffusion,
+                    only_diffusion,
+                    use_spk_mix,
+                    args.feature_retrieval)
+
+    infer_tool.mkdir(["raw", "results"])
+
+    if len(spk_mix_map) <= 1:
+        use_spk_mix = False
+    if use_spk_mix:
+        spk_list = [spk_mix_map]
 
     infer_tool.fill_a_to_b(trans, clean_names)
     for clean_name, tran in zip(clean_names, trans):

@@ -69,62 +143,39 @@ def main():
         if "." not in raw_audio_path:
             raw_audio_path += ".wav"
         infer_tool.format_wav(raw_audio_path)
-        wav_path = Path(raw_audio_path).with_suffix('.wav')
-        chunks = slicer.cut(wav_path, db_thresh=slice_db)
-        audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip*audio_sr)
-        lg_size = int(lg*audio_sr)
-        lg_size_r = int(lg_size*lgr)
-        lg_size_c_l = (lg_size-lg_size_r)//2
-        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
         for spk in spk_list:
-            ...
-            soundfile.write(raw_path, dat, audio_sr, format="wav")
-            raw_path.seek(0)
-            out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
-                                                cluster_infer_ratio=cluster_infer_ratio,
-                                                auto_predict_f0=auto_predict_f0,
-                                                noice_scale=noice_scale,
-                                                F0_mean_pooling = F0_mean_pooling
-                                                )
-            _audio = out_audio.cpu().numpy()
-            pad_len = int(svc_model.target_sample * pad_seconds)
-            _audio = _audio[pad_len:-pad_len]
-            _audio = infer_tool.pad_array(_audio, per_length)
-            if lg_size!=0 and k!=0:
-                lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
-                lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
-                lg_pre = lg1*(1-lg)+lg2*lg
-                audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
-                audio.extend(lg_pre)
-                _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
-            audio.extend(list(_audio))
+            kwarg = {
+                "raw_audio_path": raw_audio_path,
+                "spk": spk,
+                "tran": tran,
+                "slice_db": slice_db,
+                "cluster_infer_ratio": cluster_infer_ratio,
+                "auto_predict_f0": auto_predict_f0,
+                "noice_scale": noice_scale,
+                "pad_seconds": pad_seconds,
+                "clip_seconds": clip,
+                "lg_num": lg,
+                "lgr_num": lgr,
+                "f0_predictor": f0p,
+                "enhancer_adaptive_key": enhancer_adaptive_key,
+                "cr_threshold": cr_threshold,
+                "k_step": k_step,
+                "use_spk_mix": use_spk_mix,
+                "second_encoding": second_encoding,
+                "loudness_envelope_adjustment": loudness_envelope_adjustment
+            }
+            audio = svc_model.slice_inference(**kwarg)
             key = "auto" if auto_predict_f0 else f"{tran}key"
             cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
-
+            isdiffusion = "sovits"
+            if shallow_diffusion: isdiffusion = "sovdiff"
+            if only_diffusion: isdiffusion = "diff"
+            if use_spk_mix:
+                spk = "spk_mix"
+            res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
             soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
+            svc_model.clear_empty()
+
 
 if __name__ == '__main__':
     main()
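The new preprocess_args helper builds the checkpoint and config paths from the first speaker name and the unchanged defaults "logs/44k/" and "configs/". A small worked example of that path convention (illustrative, not part of the commit):

spk = "buyizi"
model_path = "logs/44k/" + f"{spk}.pth"            # -> logs/44k/buyizi.pth
config_path = "configs/" + f"config_{spk}.json"    # -> configs/config_buyizi.json
print(model_path, config_path)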
models.py CHANGED

@@ -13,111 +13,111 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
 import utils
 from modules.commons import init_weights, get_padding
-from vdecoder.hifigan.models import Generator
 from utils import f0_to_coarse
 
 class ResidualCouplingBlock(nn.Module):
-    ...
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                              gin_channels=gin_channels, mean_only=True))
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
 
 
 class Encoder(nn.Module):
-    ...
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        # print(x.shape,x_lengths.shape)
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
 
 
 class TextEncoder(nn.Module):
-    ...
-        self.enc_ = attentions.Encoder(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-
-    def forward(self, x, x_mask, f0=None, noice_scale=1):
-        x = x + self.f0_emb(f0).transpose(1,2)
-        x = self.enc_(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
-
-        return z, m, logs, x_mask
+    def __init__(self,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 n_layers,
+                 gin_channels=0,
+                 filter_channels=None,
+                 n_heads=None,
+                 p_dropout=None):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+        self.f0_emb = nn.Embedding(256, hidden_channels)
+
+        self.enc_ = attentions.Encoder(
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout)
+
+    def forward(self, x, x_mask, f0=None, noice_scale=1):
+        x = x + self.f0_emb(f0).transpose(1, 2)
+        x = self.enc_(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
+
+        return z, m, logs, x_mask
 
 
 class DiscriminatorP(torch.nn.Module):

@@ -140,7 +140,7 @@ class DiscriminatorP(torch.nn.Module):
 
         # 1d to 2d
         b, c, t = x.shape
-        if t % self.period != 0:
+        if t % self.period != 0: # pad first
             n_pad = self.period - (t % self.period)
             x = F.pad(x, (0, n_pad), "reflect")
             t = t + n_pad

@@ -188,7 +188,7 @@ class DiscriminatorS(torch.nn.Module):
 class MultiPeriodDiscriminator(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2,3,5,7,11]
+        periods = [2, 3, 5, 7, 11]
 
         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]

@@ -225,26 +225,26 @@ class SpeakerEncoder(torch.nn.Module):
 
     def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
         mel_slices = []
-        for i in range(0, total_frames-partial_frames, partial_hop):
-            mel_range = torch.arange(i, i+partial_frames)
+        for i in range(0, total_frames - partial_frames, partial_hop):
+            mel_range = torch.arange(i, i + partial_frames)
             mel_slices.append(mel_range)
 
         return mel_slices
 
     def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
         mel_len = mel.size(1)
-        last_mel = mel[
+        last_mel = mel[:, -partial_frames:]
 
         if mel_len > partial_frames:
             mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
-            mels = list(mel[:,s] for s in mel_slices)
+            mels = list(mel[:, s] for s in mel_slices)
             mels.append(last_mel)
             mels = torch.stack(tuple(mels), 0).squeeze(1)
 
             with torch.no_grad():
                 partial_embeds = self(mels)
                 embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
-                #embed = embed / torch.linalg.norm(embed, 2)
+                # embed = embed / torch.linalg.norm(embed, 2)
         else:
             with torch.no_grad():
                 embed = self(last_mel)

@@ -280,7 +280,7 @@ class F0Decoder(nn.Module):
             kernel_size,
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-        self.f0_prenet = nn.Conv1d(1, hidden_channels
+        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
         self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
 
     def forward(self, x, norm_f0, x_mask, spk_emb=None):

@@ -295,126 +295,175 @@ class F0Decoder(nn.Module):
 
 
 class SynthesizerTrn(nn.Module):
-    ...
+    """
+    Synthesizer for Training
+    """
+
+    def __init__(self,
+                 spec_channels,
+                 segment_size,
+                 inter_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 resblock,
+                 resblock_kernel_sizes,
+                 resblock_dilation_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 upsample_kernel_sizes,
+                 gin_channels,
+                 ssl_dim,
+                 n_speakers,
+                 sampling_rate=44100,
+                 vol_embedding=False,
+                 vocoder_name = "nsf-hifigan",
+                 **kwargs):
+
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.ssl_dim = ssl_dim
+        self.vol_embedding = vol_embedding
+        self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        if vol_embedding:
+            self.emb_vol = nn.Linear(1, hidden_channels)
+
+        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
+
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels=filter_channels,
+            n_heads=n_heads,
+            n_layers=n_layers,
+            kernel_size=kernel_size,
+            p_dropout=p_dropout
+        )
+        hps = {
+            "sampling_rate": sampling_rate,
+            "inter_channels": inter_channels,
+            "resblock": resblock,
+            "resblock_kernel_sizes": resblock_kernel_sizes,
+            "resblock_dilation_sizes": resblock_dilation_sizes,
+            "upsample_rates": upsample_rates,
+            "upsample_initial_channel": upsample_initial_channel,
+            "upsample_kernel_sizes": upsample_kernel_sizes,
+            "gin_channels": gin_channels,
+        }
+
+
+        if vocoder_name == "nsf-hifigan":
+            from vdecoder.hifigan.models import Generator
+            self.dec = Generator(h=hps)
+        elif vocoder_name == "nsf-snake-hifigan":
+            from vdecoder.hifiganwithsnake.models import Generator
+            self.dec = Generator(h=hps)
+        else:
+            print("[?] Unkown vocoder: use default(nsf-hifigan)")
+            from vdecoder.hifigan.models import Generator
+            self.dec = Generator(h=hps)
+
+        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+        self.f0_decoder = F0Decoder(
+            1,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            spk_channels=gin_channels
+        )
+        self.emb_uv = nn.Embedding(2, hidden_channels)
+        self.character_mix = False
+
+    def EnableCharacterMix(self, n_speakers_map, device):
+        self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels)).to(device)
+        for i in range(n_speakers_map):
+            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]).to(device))
+        self.speaker_map = self.speaker_map.unsqueeze(0).to(device)
+        self.character_mix = True
+
+    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None, vol = None):
+        g = self.emb_g(g).transpose(1,2)
+
+        # vol proj
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+
+        # ssl prenet
+        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
+
-        c_lengths =
+        # f0 predict
         lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
-        norm_lf0 = utils.normalize_f0(lf0, x_mask, uv
+        norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
         pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-        f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
 
-    ...
+        # encoder
+        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
+        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+
+        # flow
+        z_p = self.flow(z, spec_mask, g=g)
+        z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
+
+        # nsf decoder
+        o = self.dec(z_slice, g=g, f0=pitch_slice)
+
+        return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
+
+    def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None):
+
+        if c.device == torch.device("cuda"):
+            torch.cuda.manual_seed_all(seed)
+        else:
+            torch.manual_seed(seed)
+
+        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+
+        if self.character_mix and len(g) > 1:   # [N, S]  *  [S, B, 1, H]
+            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
+            g = g * self.speaker_map  # [N, S, B, 1, H]
+            g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
+            g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
+        else:
+            if g.dim() == 1:
+                g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
+
+        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+        # vol proj
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
+
+        if predict_f0:
+            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
+            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
+            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
+
+        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
+        z = self.flow(z_p, c_mask, g=g, reverse=True)
+        o = self.dec(z * c_mask, g=g, f0=f0)
+        return o,f0
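Both forward() and infer() compress f0 with lf0 = 2595·log10(1 + f0/700)/500 (a mel-style scale divided by 500), and infer() inverts it when predict_f0 is set. A quick round-trip check of that pair of formulas, outside the model (illustrative only):

import torch

f0 = torch.tensor([100.0, 220.0, 440.0])                    # Hz
lf0 = 2595. * torch.log10(1. + f0 / 700.) / 500              # forward(): Hz -> compressed log scale
f0_back = 700 * (torch.pow(10, lf0 * 500 / 2595) - 1)        # infer(): exact inverse
assert torch.allclose(f0, f0_back, atol=1e-3)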
utils.py CHANGED

@@ -6,18 +6,21 @@ import argparse
 import logging
 import json
 import subprocess
+import warnings
 import random
-
+import functools
 import librosa
 import numpy as np
 from scipy.io.wavfile import read
 import torch
 from torch.nn import functional as F
 from modules.commons import sequence_mask
-
+import faiss
+import tqdm
+
 MATPLOTLIB_FLAG = False
 
-logging.basicConfig(stream=sys.stdout, level=logging.
+logging.basicConfig(stream=sys.stdout, level=logging.WARN)
 logger = logging
 
 f0_bin = 256

@@ -26,26 +29,6 @@ f0_min = 50.0
 f0_mel_min = 1127 * np.log(1 + f0_min / 700)
 f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
-
-# def normalize_f0(f0, random_scale=True):
-#     f0_norm = f0.clone()  # create a copy of the input Tensor
-#     batch_size, _, frame_length = f0_norm.shape
-#     for i in range(batch_size):
-#         means = torch.mean(f0_norm[i, 0, :])
-#         if random_scale:
-#             factor = random.uniform(0.8, 1.2)
-#         else:
-#             factor = 1
-#         f0_norm[i, 0, :] = (f0_norm[i, 0, :] - means) * factor
-#     return f0_norm
-# def normalize_f0(f0, random_scale=True):
-#     means = torch.mean(f0[:, 0, :], dim=1, keepdim=True)
-#     if random_scale:
-#         factor = torch.Tensor(f0.shape[0],1).uniform_(0.8, 1.2).to(f0.device)
-#     else:
-#         factor = torch.ones(f0.shape[0], 1, 1).to(f0.device)
-#     f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
-#     return f0_norm
 def normalize_f0(f0, x_mask, uv, random_scale=True):
     # calculate means based on x_mask
     uv_sum = torch.sum(uv, dim=1, keepdim=True)

@@ -62,7 +45,6 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
         exit(0)
     return f0_norm * x_mask
 
-
 def plot_data_to_numpy(x, y):
     global MATPLOTLIB_FLAG
     if not MATPLOTLIB_FLAG:

@@ -86,87 +68,6 @@ def plot_data_to_numpy(x, y):
     return data
 
 
-
-def interpolate_f0(f0):
-    '''
-    对F0进行插值处理
-    '''
-
-    data = np.reshape(f0, (f0.size, 1))
-
-    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-    vuv_vector[data > 0.0] = 1.0
-    vuv_vector[data <= 0.0] = 0.0
-
-    ip_data = data
-
-    frame_number = data.size
-    last_value = 0.0
-    for i in range(frame_number):
-        if data[i] <= 0.0:
-            j = i + 1
-            for j in range(i + 1, frame_number):
-                if data[j] > 0.0:
-                    break
-            if j < frame_number - 1:
-                if last_value > 0.0:
-                    step = (data[j] - data[i - 1]) / float(j - i)
-                    for k in range(i, j):
-                        ip_data[k] = data[i - 1] + step * (k - i + 1)
-                else:
-                    for k in range(i, j):
-                        ip_data[k] = data[j]
-            else:
-                for k in range(i, frame_number):
-                    ip_data[k] = last_value
-        else:
-            ip_data[i] = data[i]
-            last_value = data[i]
-
-    return ip_data[:,0], vuv_vector[:,0]
-
-
-def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-    import parselmouth
-    x = wav_numpy
-    if p_len is None:
-        p_len = x.shape[0]//hop_length
-    else:
-        assert abs(p_len-x.shape[0]//hop_length) < 4, "pad length error"
-    time_step = hop_length / sampling_rate * 1000
-    f0_min = 50
-    f0_max = 1100
-    f0 = parselmouth.Sound(x, sampling_rate).to_pitch_ac(
-        time_step=time_step / 1000, voicing_threshold=0.6,
-        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
-
-    pad_size=(p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-        f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-    return f0
-
-def resize_f0(x, target_len):
-    source = np.array(x)
-    source[source<0.001] = np.nan
-    target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
-    res = np.nan_to_num(target)
-    return res
-
-def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-    import pyworld
-    if p_len is None:
-        p_len = wav_numpy.shape[0]//hop_length
-    f0, t = pyworld.dio(
-        wav_numpy.astype(np.double),
-        fs=sampling_rate,
-        f0_ceil=800,
-        frame_period=1000 * hop_length / sampling_rate,
-    )
-    f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
-    for index, pitch in enumerate(f0):
-        f0[index] = round(pitch, 1)
-    return resize_f0(f0, p_len)
-
 def f0_to_coarse(f0):
     is_torch = isinstance(f0, torch.Tensor)
     f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

@@ -174,48 +75,73 @@ def f0_to_coarse(f0):
 
     f0_mel[f0_mel <= 1] = 1
     f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-    f0_coarse = (f0_mel + 0.5).
+    f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
     assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
     return f0_coarse
 
-
-def get_hubert_model():
-    vec_path = "hubert/checkpoint_best_legacy_500.pt"
-    print("load model(s) from {}".format(vec_path))
-    from fairseq import checkpoint_utils
-    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-        [vec_path],
-        suffix="",
-    )
-    model = models[0]
-    model.eval()
-    return model
-
-def get_hubert_content(hmodel, wav_16k_tensor):
-    feats = wav_16k_tensor
-    if feats.dim() == 2:  # double channels
-        feats = feats.mean(-1)
-    assert feats.dim() == 1, feats.dim()
-    feats = feats.view(1, -1)
-    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-    inputs = {
-        "source": feats.to(wav_16k_tensor.device),
-        "padding_mask": padding_mask.to(wav_16k_tensor.device),
-        "output_layer": 9,  # layer 9
-    }
-    with torch.no_grad():
-        logits = hmodel.extract_features(**inputs)
-        feats = hmodel.final_proj(logits[0])
-    return feats.transpose(1, 2)
-
-
 def get_content(cmodel, y):
     with torch.no_grad():
         c = cmodel.extract_features(y.squeeze(1))[0]
     c = c.transpose(1, 2)
     return c
 
-
+def get_f0_predictor(f0_predictor,hop_length,sampling_rate,**kargs):
+    if f0_predictor == "pm":
+        from modules.F0Predictor.PMF0Predictor import PMF0Predictor
+        f0_predictor_object = PMF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
+    elif f0_predictor == "crepe":
+        from modules.F0Predictor.CrepeF0Predictor import CrepeF0Predictor
+        f0_predictor_object = CrepeF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,device=kargs["device"],threshold=kargs["threshold"])
+    elif f0_predictor == "harvest":
+        from modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
+        f0_predictor_object = HarvestF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
+    elif f0_predictor == "dio":
+        from modules.F0Predictor.DioF0Predictor import DioF0Predictor
+        f0_predictor_object = DioF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
+    else:
+        raise Exception("Unknown f0 predictor")
+    return f0_predictor_object
+
+def get_speech_encoder(speech_encoder,device=None,**kargs):
+    if speech_encoder == "vec768l12":
+        from vencoder.ContentVec768L12 import ContentVec768L12
+        speech_encoder_object = ContentVec768L12(device = device)
+    elif speech_encoder == "vec256l9":
+        from vencoder.ContentVec256L9 import ContentVec256L9
+        speech_encoder_object = ContentVec256L9(device = device)
+    elif speech_encoder == "vec256l9-onnx":
+        from vencoder.ContentVec256L9_Onnx import ContentVec256L9_Onnx
+        speech_encoder_object = ContentVec256L9_Onnx(device = device)
+    elif speech_encoder == "vec256l12-onnx":
+        from vencoder.ContentVec256L12_Onnx import ContentVec256L12_Onnx
+        speech_encoder_object = ContentVec256L12_Onnx(device = device)
+    elif speech_encoder == "vec768l9-onnx":
+        from vencoder.ContentVec768L9_Onnx import ContentVec768L9_Onnx
+        speech_encoder_object = ContentVec768L9_Onnx(device = device)
+    elif speech_encoder == "vec768l12-onnx":
+        from vencoder.ContentVec768L12_Onnx import ContentVec768L12_Onnx
+        speech_encoder_object = ContentVec768L12_Onnx(device = device)
+    elif speech_encoder == "hubertsoft-onnx":
+        from vencoder.HubertSoft_Onnx import HubertSoft_Onnx
+        speech_encoder_object = HubertSoft_Onnx(device = device)
+    elif speech_encoder == "hubertsoft":
+        from vencoder.HubertSoft import HubertSoft
+        speech_encoder_object = HubertSoft(device = device)
+    elif speech_encoder == "whisper-ppg":
+        from vencoder.WhisperPPG import WhisperPPG
+        speech_encoder_object = WhisperPPG(device = device)
+    elif speech_encoder == "cnhubertlarge":
+        from vencoder.CNHubertLarge import CNHubertLarge
+        speech_encoder_object = CNHubertLarge(device = device)
+    elif speech_encoder == "dphubert":
+        from vencoder.DPHubert import DPHubert
+        speech_encoder_object = DPHubert(device = device)
+    elif speech_encoder == "whisper-ppg-large":
+        from vencoder.WhisperPPGLarge import WhisperPPGLarge
+        speech_encoder_object = WhisperPPGLarge(device = device)
+    else:
+        raise Exception("Unknown speech encoder")
+    return speech_encoder_object
 
 def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
     assert os.path.isfile(checkpoint_path)

@@ -244,6 +170,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False)
         model.module.load_state_dict(new_state_dict)
     else:
         model.load_state_dict(new_state_dict)
+    print("load ")
     logger.info("Loaded checkpoint '{}' (iteration {})".format(
         checkpoint_path, iteration))
     return model, optimizer, learning_rate, iteration

@@ -368,7 +295,7 @@ def load_filepaths_and_text(filename, split="|"):
 
 def get_hparams(init=True):
     parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/
+    parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
                         help='JSON file for configuration')
     parser.add_argument('-m', '--model', type=str, required=True,
                         help='Model name')

@@ -411,7 +338,6 @@ def get_hparams_from_file(config_path):
     with open(config_path, "r") as f:
         data = f.read()
     config = json.loads(data)
-
    hparams =HParams(**config)
    return hparams
 
@@ -468,6 +394,73 @@ def repeat_expand_2d(content, target_len):
     return target
 
 
+def mix_model(model_paths,mix_rate,mode):
+    mix_rate = torch.FloatTensor(mix_rate)/100
+    model_tem = torch.load(model_paths[0])
+    models = [torch.load(path)["model"] for path in model_paths]
+    if mode == 0:
+        mix_rate = F.softmax(mix_rate,dim=0)
+    for k in model_tem["model"].keys():
+        model_tem["model"][k] = torch.zeros_like(model_tem["model"][k])
+        for i,model in enumerate(models):
+            model_tem["model"][k] += model[k]*mix_rate[i]
+    torch.save(model_tem,os.path.join(os.path.curdir,"output.pth"))
+    return os.path.join(os.path.curdir,"output.pth")
+
+def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频,2是输出音频,rate是2的占比 from RVC
+    # print(data1.max(),data2.max())
+    rms1 = librosa.feature.rms(
+        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+    )  # 每半秒一个点
+    rms2 = librosa.feature.rms(y=data2.detach().cpu().numpy(), frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+    rms1 = torch.from_numpy(rms1).to(data2.device)
+    rms1 = F.interpolate(
+        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.from_numpy(rms2).to(data2.device)
+    rms2 = F.interpolate(
+        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+    data2 *= (
+        torch.pow(rms1, torch.tensor(1 - rate))
+        * torch.pow(rms2, torch.tensor(rate - 1))
+    )
+    return data2
+
+def train_index(spk_name,root_dir = "dataset/44k/"):  # from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
+    print("The feature index is constructing.")
+    exp_dir = os.path.join(root_dir,spk_name)
+    listdir_res = []
+    for file in os.listdir(exp_dir):
+        if ".wav.soft.pt" in file:
+            listdir_res.append(os.path.join(exp_dir,file))
+    if len(listdir_res) == 0:
+        raise Exception("You need to run preprocess_hubert_f0.py!")
+    npys = []
+    for name in sorted(listdir_res):
+        phone = torch.load(name)[0].transpose(-1,-2).numpy()
+        npys.append(phone)
+    big_npy = np.concatenate(npys, 0)
+    big_npy_idx = np.arange(big_npy.shape[0])
+    np.random.shuffle(big_npy_idx)
+    big_npy = big_npy[big_npy_idx]
+    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+    index = faiss.index_factory(big_npy.shape[1] , "IVF%s,Flat" % n_ivf)
+    index_ivf = faiss.extract_index_ivf(index)  #
+    index_ivf.nprobe = 1
+    index.train(big_npy)
+    batch_size_add = 8192
+    for i in range(0, big_npy.shape[0], batch_size_add):
+        index.add(big_npy[i : i + batch_size_add])
+    # faiss.write_index(
+    #     index,
+    #     f"added_{spk_name}.index"
+    # )
+    print("Successfully build index")
+    return index
+
+
 class HParams():
     def __init__(self, **kwargs):
         for k, v in kwargs.items():

@@ -499,3 +492,19 @@ class HParams():
     def __repr__(self):
         return self.__dict__.__repr__()
 
+    def get(self,index):
+        return self.__dict__.get(index)
+
+class Volume_Extractor:
+    def __init__(self, hop_size = 512):
+        self.hop_size = hop_size
+
+    def extract(self, audio): # audio: 2d tensor array
+        if not isinstance(audio,torch.Tensor):
+           audio = torch.Tensor(audio)
+        n_frames = int(audio.size(-1) // self.hop_size)
+        audio2 = audio ** 2
+        audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
+        volume = torch.FloatTensor([torch.mean(audio2[:,int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
+        volume = torch.sqrt(volume)
+        return volume
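Volume_Extractor returns per-frame RMS at the given hop size, which is the same kind of feature the data loader reads from the .vol.npy files. A usage sketch, assuming utils.py is importable as utils and that the hop size matches the model's hop_length:

import torch
from utils import Volume_Extractor

audio = torch.randn(1, 44100) * 0.1       # (channels, samples), as the class expects a 2d tensor
extractor = Volume_Extractor(hop_size=512)
volume = extractor.extract(audio)         # per-frame RMS, shape roughly (samples // 512,)
print(volume.shape, float(volume.mean()))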