Spaces:
Commit: add
Browse files
- app.py +6 -5
- export_index_for_onnx.py +0 -20
- flask_api.py +0 -60
- flask_api_full_song.py +0 -55
- logs/44k/1 +0 -0
- onnx_export.py +0 -144
- onnx_export_old.py +0 -56
- onnxexport/model_onnx.py +0 -333
- onnxexport/model_onnx_speaker_mix.py +0 -366
- train.py +0 -329
- train_diff.py +0 -77
- train_index.py +0 -30
- trained/put_trained_checkpoints_here +0 -0
- wav_upload.py +0 -25
- webUI.py +0 -430
app.py
CHANGED
@@ -21,7 +21,8 @@ def list_files_tree(directory, indent=""):
 
 from huggingface_hub import snapshot_download
 print("Models...")
-models_id = """None1145/So-VITS-SVC-
+models_id = """None1145/So-VITS-SVC-Lappland
+None1145/So-VITS-SVC-Lappland-the-Decadenza"""
 for model_id in models_id.split("\n"):
     if model_id in ["", " "]:
         break
@@ -95,12 +96,12 @@ def vc_fn(input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scal
 app = gr.Blocks()
 with app:
     with gr.Tabs():
-        with gr.
+        with gr.Tabs():
             speaker = gr.Dropdown(label="讲话人", choices=speakers, value=speakers[0])
             model_submit = gr.Button("加载模型", variant="primary")
             model_output1 = gr.Textbox(label="Output Message")
-
-        with gr.
+            model_submit.click(load, [speaker], [model_output1])
+        with gr.Tabs():
             # sid = gr.Dropdown(label="音色", choices=speakers, value=speakers[0])
             vc_input3 = gr.Audio(label="上传音频")
             vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
@@ -111,5 +112,5 @@ with app:
             vc_submit = gr.Button("转换", variant="primary")
             vc_output1 = gr.Textbox(label="Output Message")
             vc_output2 = gr.Audio(label="Output Audio")
-
+            vc_submit.click(vc_fn, [vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2])
 app.launch()
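For context, the change above only adds the two `.click()` registrations that wire the buttons to their callbacks. A minimal, self-contained sketch of the same Blocks/Tabs + `.click()` pattern is shown below; the stub callbacks, tab titles, and English labels are placeholders standing in for the Space's real `load()` / `vc_fn()`, and `gr.Tab` is used where the Space nests a second `gr.Tabs()`.

import gradio as gr

def load_stub(speaker):
    # placeholder for the Space's load(): would download / switch the model here
    return f"Loaded model for {speaker}"

def convert_stub(audio, transform):
    # placeholder for vc_fn(): would run inference and return (message, audio)
    return "ok", audio

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Model"):
            speaker = gr.Dropdown(label="Speaker", choices=["a", "b"], value="a")
            model_submit = gr.Button("Load model", variant="primary")
            model_output = gr.Textbox(label="Output Message")
            # Button.click(fn, inputs, outputs) registers the handler,
            # exactly as the diff does with model_submit.click(load, ...)
            model_submit.click(load_stub, [speaker], [model_output])
        with gr.Tab("Convert"):
            audio_in = gr.Audio(label="Upload audio")
            transform = gr.Number(label="Transpose (semitones)", value=0)
            run = gr.Button("Convert", variant="primary")
            msg = gr.Textbox(label="Output Message")
            audio_out = gr.Audio(label="Output Audio")
            run.click(convert_stub, [audio_in, transform], [msg, audio_out])

demo.launch()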
export_index_for_onnx.py
DELETED
@@ -1,20 +0,0 @@
-import os
-import pickle
-
-import faiss
-
-path = "crs"
-indexs_file_path = f"checkpoints/{path}/feature_and_index.pkl"
-indexs_out_dir = f"checkpoints/{path}/"
-
-with open("feature_and_index.pkl",mode="rb") as f:
-    indexs = pickle.load(f)
-
-for k in indexs:
-    print(f"Save {k} index")
-    faiss.write_index(
-        indexs[k],
-        os.path.join(indexs_out_dir,f"Index-{k}.index")
-    )
-
-print("Saved all index")
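The deleted script writes one `Index-{speaker}.index` file per speaker with `faiss.write_index`. A hedged sketch of reading one back and querying it is below; the speaker key in the file name and the query shape are assumptions for illustration.

import faiss
import numpy as np

index = faiss.read_index("checkpoints/crs/Index-speaker0.index")  # hypothetical speaker key
query = np.random.rand(1, index.d).astype("float32")              # one feature vector of dimension index.d
distances, ids = index.search(query, k=4)                          # 4 nearest stored features
print(ids, distances)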
flask_api.py
DELETED
@@ -1,60 +0,0 @@
-import io
-import logging
-
-import soundfile
-import torch
-import torchaudio
-from flask import Flask, request, send_file
-from flask_cors import CORS
-
-from inference.infer_tool import RealTimeVC, Svc
-
-app = Flask(__name__)
-
-CORS(app)
-
-logging.getLogger('numba').setLevel(logging.WARNING)
-
-
-@app.route("/voiceChangeModel", methods=["POST"])
-def voice_change_model():
-    request_form = request.form
-    wave_file = request.files.get("sample", None)
-    # pitch-shift information
-    f_pitch_change = float(request_form.get("fPitchChange", 0))
-    # sample rate required by the DAW
-    daw_sample = int(float(request_form.get("sampleRate", 0)))
-    speaker_id = int(float(request_form.get("sSpeakId", 0)))
-    # receive the wav file over HTTP and convert it
-    input_wav_path = io.BytesIO(wave_file.read())
-
-    # model inference
-    if raw_infer:
-        # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
-        out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
-                                            auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
-        tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
-    else:
-        out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
-                                auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
-        tar_audio = torchaudio.functional.resample(torch.from_numpy(out_audio), svc_model.target_sample, daw_sample)
-    # return the audio
-    out_wav_path = io.BytesIO()
-    soundfile.write(out_wav_path, tar_audio.cpu().numpy(), daw_sample, format="wav")
-    out_wav_path.seek(0)
-    return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
-
-
-if __name__ == '__main__':
-    # if enabled, each slice is synthesized directly; False uses cross-fading instead
-    # setting the VST plugin slice time to 0.3-0.5 s lowers latency; direct slicing can pop at the joins, cross-fading slightly overlaps the audio
-    # pick whichever trade-off is acceptable, or raise the VST max slice time to 1 s; True here means more latency but more stable quality
-    raw_infer = True
-    # each model is paired with exactly one config
-    model_name = "logs/32k/G_174000-Copy1.pth"
-    config_name = "configs/config.json"
-    cluster_model_path = "logs/44k/kmeans_10000.pt"
-    svc_model = Svc(model_name, config_name, cluster_model_path=cluster_model_path)
-    svc = RealTimeVC()
-    # this matches the VST plugin; changing it is not recommended
-    app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
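A hedged sketch of a client for the deleted `/voiceChangeModel` route follows. The form field names (`sample`, `fPitchChange`, `sampleRate`, `sSpeakId`) and the port come from the route above; the client itself, file names, and parameter values are assumptions.

import io
import requests
import soundfile as sf

data, sr = sf.read("input.wav")                  # any short clip
buf = io.BytesIO()
sf.write(buf, data, sr, format="wav")
buf.seek(0)

resp = requests.post(
    "http://127.0.0.1:6842/voiceChangeModel",
    files={"sample": ("sample.wav", buf, "audio/wav")},
    data={"fPitchChange": 0, "sampleRate": sr, "sSpeakId": 0},
)
with open("converted.wav", "wb") as f:
    f.write(resp.content)                         # the route returns the converted wav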
flask_api_full_song.py
DELETED
@@ -1,55 +0,0 @@
-import io
-
-import numpy as np
-import soundfile
-from flask import Flask, request, send_file
-
-from inference import infer_tool, slicer
-
-app = Flask(__name__)
-
-
-@app.route("/wav2wav", methods=["POST"])
-def wav2wav():
-    request_form = request.form
-    audio_path = request_form.get("audio_path", None)  # path to the wav file
-    tran = int(float(request_form.get("tran", 0)))  # pitch shift
-    spk = request_form.get("spk", 0)  # speaker (id or name, depending on your config)
-    wav_format = request_form.get("wav_format", 'wav')  # output file format
-    infer_tool.format_wav(audio_path)
-    chunks = slicer.cut(audio_path, db_thresh=-40)
-    audio_data, audio_sr = slicer.chunks2audio(audio_path, chunks)
-
-    audio = []
-    for (slice_tag, data) in audio_data:
-        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-
-        length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
-        if slice_tag:
-            print('jump empty segment')
-            _audio = np.zeros(length)
-        else:
-            # pad
-            pad_len = int(audio_sr * 0.5)
-            data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
-            raw_path = io.BytesIO()
-            soundfile.write(raw_path, data, audio_sr, format="wav")
-            raw_path.seek(0)
-            out_audio, out_sr = svc_model.infer(spk, tran, raw_path)
-            svc_model.clear_empty()
-            _audio = out_audio.cpu().numpy()
-            pad_len = int(svc_model.target_sample * 0.5)
-            _audio = _audio[pad_len:-pad_len]
-
-        audio.extend(list(infer_tool.pad_array(_audio, length)))
-    out_wav_path = io.BytesIO()
-    soundfile.write(out_wav_path, audio, svc_model.target_sample, format=wav_format)
-    out_wav_path.seek(0)
-    return send_file(out_wav_path, download_name=f"temp.{wav_format}", as_attachment=True)
-
-
-if __name__ == '__main__':
-    model_name = "logs/44k/G_60000.pth"  # model path
-    config_name = "configs/config.json"  # config path
-    svc_model = infer_tool.Svc(model_name, config_name)
-    app.run(port=1145, host="0.0.0.0", debug=False, threaded=False)
logs/44k/1
DELETED
File without changes
onnx_export.py
DELETED
@@ -1,144 +0,0 @@
-import argparse
-import json
-
-import torch
-
-import utils
-from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
-
-parser = argparse.ArgumentParser(description='SoVitsSvc OnnxExport')
-
-def OnnxExport(path=None):
-    device = torch.device("cpu")
-    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-    SVCVITS = SynthesizerTrn(
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **hps.model)
-    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-    _ = SVCVITS.eval().to(device)
-    for i in SVCVITS.parameters():
-        i.requires_grad = False
-
-    num_frames = 200
-
-    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
-    test_pitch = torch.rand(1, num_frames)
-    test_vol = torch.rand(1, num_frames)
-    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
-    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
-    test_noise = torch.randn(1, 192, num_frames)
-    test_sid = torch.LongTensor([0])
-    export_mix = True
-    if len(hps.spk) < 2:
-        export_mix = False
-
-    if export_mix:
-        spk_mix = []
-        n_spk = len(hps.spk)
-        for i in range(n_spk):
-            spk_mix.append(1.0/float(n_spk))
-        test_sid = torch.tensor(spk_mix)
-        SVCVITS.export_chara_mix(hps.spk)
-        test_sid = test_sid.unsqueeze(0)
-        test_sid = test_sid.repeat(num_frames, 1)
-
-    SVCVITS.eval()
-
-    if export_mix:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2],
-            "sid":[0]
-        }
-    else:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2]
-        }
-
-    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-    output_names = ["audio", ]
-
-    if SVCVITS.vol_embedding:
-        input_names.append("vol")
-        vol_dadict = {"vol" : [1]}
-        daxes.update(vol_dadict)
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device)
-        )
-    else:
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device)
-        )
-
-    # SVCVITS = torch.jit.script(SVCVITS)
-    SVCVITS(test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device))
-
-    SVCVITS.dec.OnnxExport()
-
-    torch.onnx.export(
-        SVCVITS,
-        test_inputs,
-        f"checkpoints/{path}/{path}_SoVits.onnx",
-        dynamic_axes=daxes,
-        do_constant_folding=False,
-        opset_version=16,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names
-    )
-
-    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
-    spklist = []
-    for key in hps.spk.keys():
-        spklist.append(key)
-
-    MoeVSConf = {
-        "Folder" : f"{path}",
-        "Name" : f"{path}",
-        "Type" : "SoVits",
-        "Rate" : hps.data.sampling_rate,
-        "Hop" : hps.data.hop_length,
-        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
-        "SoVits4": True,
-        "SoVits3": False,
-        "CharaMix": export_mix,
-        "Volume": SVCVITS.vol_embedding,
-        "HiddenSize": SVCVITS.gin_channels,
-        "Characters": spklist,
-        "Cluster": ""
-    }
-
-    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
-        json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
-
-
-if __name__ == '__main__':
-    parser.add_argument('-n', '--model_name', type=str, default="TransformerFlow", help='model folder name (create a "checkpoints" folder in the project root, make a sub-folder inside it for the model, and pass that sub-folder name here)')
-    args = parser.parse_args()
-    path = args.model_name
-    OnnxExport(path)
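For reference, a hedged sketch of running the exported graph with onnxruntime is below. The input names and shapes mirror the dummy tensors in the export script above (single-speaker case, `sid` as an int64 index); the file path and the 768 feature dimension are assumptions for illustration.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("checkpoints/MyModel/MyModel_SoVits.onnx")  # hypothetical path
T = 200
feed = {
    "c": np.random.rand(1, T, 768).astype(np.float32),      # content features
    "f0": (np.random.rand(1, T).astype(np.float32) * 400),  # pitch track in Hz
    "mel2ph": np.arange(T, dtype=np.int64)[None],            # frame-to-feature alignment
    "uv": np.ones((1, T), dtype=np.float32),                 # voiced/unvoiced flags
    "noise": np.random.randn(1, 192, T).astype(np.float32),
    "sid": np.array([0], dtype=np.int64),                    # speaker index
}
audio = sess.run(["audio"], feed)[0]  # waveform at hps.data.sampling_rate
print(audio.shape)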
onnx_export_old.py
DELETED
@@ -1,56 +0,0 @@
-import torch
-
-import utils
-from onnxexport.model_onnx import SynthesizerTrn
-
-
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
-
-
-if __name__ == '__main__':
-    main(True)
onnxexport/model_onnx.py
DELETED
@@ -1,333 +0,0 @@
-import torch
-from torch import nn
-from torch.nn import Conv1d, Conv2d
-from torch.nn import functional as F
-from torch.nn.utils import spectral_norm, weight_norm
-
-import modules.attentions as attentions
-import modules.commons as commons
-import modules.modules as modules
-import utils
-from modules.commons import get_padding
-from utils import f0_to_coarse
-from vdecoder.hifigan.models import Generator
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                              gin_channels=gin_channels, mean_only=True))
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 gin_channels=0):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        # print(x.shape,x_lengths.shape)
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class TextEncoder(nn.Module):
-    def __init__(self,
-                 out_channels,
-                 hidden_channels,
-                 kernel_size,
-                 n_layers,
-                 gin_channels=0,
-                 filter_channels=None,
-                 n_heads=None,
-                 p_dropout=None):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-        self.f0_emb = nn.Embedding(256, hidden_channels)
-
-        self.enc_ = attentions.Encoder(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-
-    def forward(self, x, x_mask, f0=None, z=None):
-        x = x + self.f0_emb(f0).transpose(1, 2)
-        x = self.enc_(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + z * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
-        ])
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-        ])
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class F0Decoder(nn.Module):
-    def __init__(self,
-                 out_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 spk_channels=0):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.spk_channels = spk_channels
-
-        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
-        self.decoder = attentions.FFT(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
-        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
-
-    def forward(self, x, norm_f0, x_mask, spk_emb=None):
-        x = torch.detach(x)
-        if spk_emb is not None:
-            x = x + self.cond(spk_emb)
-        x += self.f0_prenet(norm_f0)
-        x = self.prenet(x) * x_mask
-        x = self.decoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x
-
-
-class SynthesizerTrn(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(self,
-                 spec_channels,
-                 segment_size,
-                 inter_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 resblock,
-                 resblock_kernel_sizes,
-                 resblock_dilation_sizes,
-                 upsample_rates,
-                 upsample_initial_channel,
-                 upsample_kernel_sizes,
-                 gin_channels,
-                 ssl_dim,
-                 n_speakers,
-                 sampling_rate=44100,
-                 **kwargs):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.ssl_dim = ssl_dim
-        self.emb_g = nn.Embedding(n_speakers, gin_channels)
-
-        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
-
-        self.enc_p = TextEncoder(
-            inter_channels,
-            hidden_channels,
-            filter_channels=filter_channels,
-            n_heads=n_heads,
-            n_layers=n_layers,
-            kernel_size=kernel_size,
-            p_dropout=p_dropout
-        )
-        hps = {
-            "sampling_rate": sampling_rate,
-            "inter_channels": inter_channels,
-            "resblock": resblock,
-            "resblock_kernel_sizes": resblock_kernel_sizes,
-            "resblock_dilation_sizes": resblock_dilation_sizes,
-            "upsample_rates": upsample_rates,
-            "upsample_initial_channel": upsample_initial_channel,
-            "upsample_kernel_sizes": upsample_kernel_sizes,
-            "gin_channels": gin_channels,
-        }
-        self.dec = Generator(h=hps)
-        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-        self.f0_decoder = F0Decoder(
-            1,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-            spk_channels=gin_channels
-        )
-        self.emb_uv = nn.Embedding(2, hidden_channels)
-        self.predict_f0 = False
-
-    def forward(self, c, f0, mel2ph, uv, noise=None, g=None):
-
-        decoder_inp = F.pad(c, [0, 0, 1, 0])
-        mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
-        c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2)  # [B, T, H]
-
-        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
-        g = g.unsqueeze(0)
-        g = self.emb_g(g).transpose(1, 2)
-        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
-
-        if self.predict_f0:
-            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
-            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
-            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
-
-        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
-        z = self.flow(z_p, c_mask, g=g, reverse=True)
-        o = self.dec(z * c_mask, g=g, f0=f0)
-        return o
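A small numeric check of the f0-to-lf0 mapping used in `SynthesizerTrn.forward` above: `lf0` is the mel-scale value of `f0` divided by 500, and the predicted value is mapped back through the exact inverse, so a round trip should reproduce `f0`. This is a standalone sketch, not part of the deleted file.

import numpy as np

f0 = np.array([110.0, 220.0, 440.0])                 # sample pitches in Hz
lf0 = 2595.0 * np.log10(1.0 + f0 / 700.0) / 500      # forward mapping from the model
f0_back = 700 * (np.power(10, lf0 * 500 / 2595) - 1)  # inverse applied to the decoder output
print(np.allclose(f0, f0_back))  # True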
onnxexport/model_onnx_speaker_mix.py
DELETED
@@ -1,366 +0,0 @@
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-import modules.attentions as attentions
-import modules.commons as commons
-import modules.modules as modules
-import utils
-from utils import f0_to_coarse
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                              gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-class TransformerCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow = True, gin_channels = self.gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers, n_heads, p_dropout, filter_channels, mean_only=True, wn_sharing_parameter=self.wn, gin_channels = self.gin_channels))
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 gin_channels=0):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        # print(x.shape,x_lengths.shape)
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class TextEncoder(nn.Module):
-    def __init__(self,
-                 out_channels,
-                 hidden_channels,
-                 kernel_size,
-                 n_layers,
-                 gin_channels=0,
-                 filter_channels=None,
-                 n_heads=None,
-                 p_dropout=None):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-        self.f0_emb = nn.Embedding(256, hidden_channels)
-
-        self.enc_ = attentions.Encoder(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-
-    def forward(self, x, x_mask, f0=None, z=None):
-        x = x + self.f0_emb(f0).transpose(1, 2)
-        x = self.enc_(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + z * torch.exp(logs)) * x_mask
-
-        return z, m, logs, x_mask
-
-
-class F0Decoder(nn.Module):
-    def __init__(self,
-                 out_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 spk_channels=0):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.spk_channels = spk_channels
-
-        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
-        self.decoder = attentions.FFT(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
-        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
-
-    def forward(self, x, norm_f0, x_mask, spk_emb=None):
-        x = torch.detach(x)
-        if (spk_emb is not None):
-            x = x + self.cond(spk_emb)
-        x += self.f0_prenet(norm_f0)
-        x = self.prenet(x) * x_mask
-        x = self.decoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x
-
-
-class SynthesizerTrn(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(self,
-                 spec_channels,
-                 segment_size,
-                 inter_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 resblock,
-                 resblock_kernel_sizes,
-                 resblock_dilation_sizes,
-                 upsample_rates,
-                 upsample_initial_channel,
-                 upsample_kernel_sizes,
-                 gin_channels,
-                 ssl_dim,
-                 n_speakers,
-                 sampling_rate=44100,
-                 vol_embedding=False,
-                 vocoder_name = "nsf-hifigan",
-                 use_depthwise_conv = False,
-                 use_automatic_f0_prediction = True,
-                 flow_share_parameter = False,
-                 n_flow_layer = 4,
-                 n_layers_trans_flow = 3,
-                 use_transformer_flow = False,
-                 **kwargs):
-
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.ssl_dim = ssl_dim
-        self.vol_embedding = vol_embedding
-        self.emb_g = nn.Embedding(n_speakers, gin_channels)
-        self.use_depthwise_conv = use_depthwise_conv
-        self.use_automatic_f0_prediction = use_automatic_f0_prediction
-        self.n_layers_trans_flow = n_layers_trans_flow
-        if vol_embedding:
-            self.emb_vol = nn.Linear(1, hidden_channels)
-
-        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
-
-        self.enc_p = TextEncoder(
-            inter_channels,
-            hidden_channels,
-            filter_channels=filter_channels,
-            n_heads=n_heads,
-            n_layers=n_layers,
-            kernel_size=kernel_size,
-            p_dropout=p_dropout
-        )
-        hps = {
-            "sampling_rate": sampling_rate,
-            "inter_channels": inter_channels,
-            "resblock": resblock,
-            "resblock_kernel_sizes": resblock_kernel_sizes,
-            "resblock_dilation_sizes": resblock_dilation_sizes,
-            "upsample_rates": upsample_rates,
-            "upsample_initial_channel": upsample_initial_channel,
-            "upsample_kernel_sizes": upsample_kernel_sizes,
-            "gin_channels": gin_channels,
-            "use_depthwise_conv":use_depthwise_conv
-        }
-
-        modules.set_Conv1dModel(self.use_depthwise_conv)
-
-        if vocoder_name == "nsf-hifigan":
-            from vdecoder.hifigan.models import Generator
-            self.dec = Generator(h=hps)
-        elif vocoder_name == "nsf-snake-hifigan":
-            from vdecoder.hifiganwithsnake.models import Generator
-            self.dec = Generator(h=hps)
-        else:
-            print("[?] Unkown vocoder: use default(nsf-hifigan)")
-            from vdecoder.hifigan.models import Generator
-            self.dec = Generator(h=hps)
-
-        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-        if use_transformer_flow:
-            self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels, n_heads, n_layers_trans_flow, 5, p_dropout, n_flow_layer, gin_channels=gin_channels, share_parameter=flow_share_parameter)
-        else:
-            self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels, share_parameter=flow_share_parameter)
-        if self.use_automatic_f0_prediction:
-            self.f0_decoder = F0Decoder(
-                1,
-                hidden_channels,
-                filter_channels,
-                n_heads,
-                n_layers,
-                kernel_size,
-                p_dropout,
-                spk_channels=gin_channels
-            )
-        self.emb_uv = nn.Embedding(2, hidden_channels)
-        self.predict_f0 = False
-        self.speaker_map = []
-        self.export_mix = False
-
-    def export_chara_mix(self, speakers_mix):
-        self.speaker_map = torch.zeros((len(speakers_mix), 1, 1, self.gin_channels))
-        i = 0
-        for key in speakers_mix.keys():
-            spkidx = speakers_mix[key]
-            self.speaker_map[i] = self.emb_g(torch.LongTensor([[spkidx]]))
-            i = i + 1
-        self.speaker_map = self.speaker_map.unsqueeze(0)
-        self.export_mix = True
-
-    def forward(self, c, f0, mel2ph, uv, noise=None, g=None, vol = None):
-        decoder_inp = F.pad(c, [0, 0, 1, 0])
-        mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
-        c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2)  # [B, T, H]
-
-        if self.export_mix:   # [N, S] * [S, B, 1, H]
-            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
-            g = g * self.speaker_map  # [N, S, B, 1, H]
-            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
-            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
-        else:
-            if g.dim() == 1:
-                g = g.unsqueeze(0)
-            g = self.emb_g(g).transpose(1, 2)
-
-        x_mask = torch.unsqueeze(torch.ones_like(f0), 1).to(c.dtype)
-        # vol proj
-
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
-
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
-
-        if self.use_automatic_f0_prediction and self.predict_f0:
-            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
-            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
-            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
-
-        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
-        z = self.flow(z_p, c_mask, g=g, reverse=True)
-        o = self.dec(z * c_mask, g=g, f0=f0)
-        return o
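A toy check of the per-frame speaker mixing that `export_chara_mix` / `forward` implement above: for every frame, the conditioning vector is a weighted sum of the speaker embeddings, with the weights arriving as the per-frame `sid` tensor. The broadcast-and-sum form below mirrors the exported graph; the shapes and values are illustrative only.

import torch

n_frames, n_spk, gin = 5, 3, 8
emb = torch.randn(n_spk, gin)                        # one embedding per speaker
weights = torch.softmax(torch.randn(n_frames, n_spk), dim=-1)

# reference: plain matrix product, (frames, spk) @ (spk, gin) -> (frames, gin)
mixed = weights @ emb

# same result written as the broadcast-multiply-and-sum used by the exported model
mixed2 = (weights[:, :, None] * emb[None, :, :]).sum(dim=1)
print(torch.allclose(mixed, mixed2))  # True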
train.py
DELETED
@@ -1,329 +0,0 @@
|
|
1 |
-
import logging
|
2 |
-
import multiprocessing
|
3 |
-
import os
|
4 |
-
import time
|
5 |
-
|
6 |
-
import torch
|
7 |
-
import torch.distributed as dist
|
8 |
-
import torch.multiprocessing as mp
|
9 |
-
from torch.cuda.amp import GradScaler, autocast
|
10 |
-
from torch.nn import functional as F
|
11 |
-
from torch.nn.parallel import DistributedDataParallel as DDP
|
12 |
-
from torch.utils.data import DataLoader
|
13 |
-
from torch.utils.tensorboard import SummaryWriter
|
14 |
-
|
15 |
-
import modules.commons as commons
|
16 |
-
import utils
|
17 |
-
from data_utils import TextAudioCollate, TextAudioSpeakerLoader
|
18 |
-
from models import (
|
19 |
-
MultiPeriodDiscriminator,
|
20 |
-
SynthesizerTrn,
|
21 |
-
)
|
22 |
-
from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
|
23 |
-
from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
24 |
-
|
25 |
-
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
26 |
-
logging.getLogger('numba').setLevel(logging.WARNING)
|
27 |
-
|
28 |
-
torch.backends.cudnn.benchmark = True
|
29 |
-
global_step = 0
|
30 |
-
start_time = time.time()
|
31 |
-
|
32 |
-
# os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
|
33 |
-
|
34 |
-
|
35 |
-
def main():
|
36 |
-
"""Assume Single Node Multi GPUs Training Only"""
|
37 |
-
assert torch.cuda.is_available(), "CPU training is not allowed."
|
38 |
-
hps = utils.get_hparams()
|
39 |
-
|
40 |
-
n_gpus = torch.cuda.device_count()
|
41 |
-
os.environ['MASTER_ADDR'] = 'localhost'
|
42 |
-
os.environ['MASTER_PORT'] = hps.train.port
|
43 |
-
|
44 |
-
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
|
45 |
-
|
46 |
-
|
47 |
-
def run(rank, n_gpus, hps):
|
48 |
-
global global_step
|
49 |
-
if rank == 0:
|
50 |
-
logger = utils.get_logger(hps.model_dir)
|
51 |
-
logger.info(hps)
|
52 |
-
utils.check_git_hash(hps.model_dir)
|
53 |
-
writer = SummaryWriter(log_dir=hps.model_dir)
|
54 |
-
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
|
55 |
-
|
56 |
-
# for pytorch on win, backend use gloo
|
57 |
-
dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
|
58 |
-
torch.manual_seed(hps.train.seed)
|
59 |
-
torch.cuda.set_device(rank)
|
60 |
-
collate_fn = TextAudioCollate()
|
61 |
-
all_in_mem = hps.train.all_in_mem # If you have enough memory, turn on this option to avoid disk IO and speed up training.
|
62 |
-
train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps, all_in_mem=all_in_mem)
|
63 |
-
num_workers = 5 if multiprocessing.cpu_count() > 4 else multiprocessing.cpu_count()
|
64 |
-
if all_in_mem:
|
65 |
-
num_workers = 0
|
66 |
-
train_loader = DataLoader(train_dataset, num_workers=num_workers, shuffle=False, pin_memory=True,
|
67 |
-
batch_size=hps.train.batch_size, collate_fn=collate_fn)
|
68 |
-
if rank == 0:
|
69 |
-
eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem,vol_aug = False)
|
70 |
-
eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
|
71 |
-
batch_size=1, pin_memory=False,
|
72 |
-
drop_last=False, collate_fn=collate_fn)
|
73 |
-
|
74 |
-
net_g = SynthesizerTrn(
|
75 |
-
hps.data.filter_length // 2 + 1,
|
76 |
-
hps.train.segment_size // hps.data.hop_length,
|
77 |
-
**hps.model).cuda(rank)
|
78 |
-
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
|
79 |
-
optim_g = torch.optim.AdamW(
|
80 |
-
net_g.parameters(),
|
81 |
-
hps.train.learning_rate,
|
82 |
-
betas=hps.train.betas,
|
83 |
-
eps=hps.train.eps)
|
84 |
-
optim_d = torch.optim.AdamW(
|
85 |
-
net_d.parameters(),
|
86 |
-
hps.train.learning_rate,
|
87 |
-
betas=hps.train.betas,
|
88 |
-
eps=hps.train.eps)
|
89 |
-
net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True)
|
90 |
-
net_d = DDP(net_d, device_ids=[rank])
|
91 |
-
|
92 |
-
skip_optimizer = False
|
93 |
-
try:
|
94 |
-
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
|
95 |
-
optim_g, skip_optimizer)
|
96 |
-
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
|
97 |
-
optim_d, skip_optimizer)
|
98 |
-
epoch_str = max(epoch_str, 1)
|
99 |
-
name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
|
100 |
-
global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1
|
101 |
-
#global_step = (epoch_str - 1) * len(train_loader)
|
102 |
-
except Exception:
|
103 |
-
print("load old checkpoint failed...")
|
104 |
-
epoch_str = 1
|
105 |
-
global_step = 0
|
106 |
-
if skip_optimizer:
|
107 |
-
epoch_str = 1
|
108 |
-
global_step = 0
|
109 |
-
|
110 |
-
warmup_epoch = hps.train.warmup_epochs
|
111 |
-
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
|
112 |
-
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
|
113 |
-
|
114 |
-
scaler = GradScaler(enabled=hps.train.fp16_run)
|
115 |
-
|
116 |
-
for epoch in range(epoch_str, hps.train.epochs + 1):
|
117 |
-
# set up warm-up learning rate
|
118 |
-
if epoch <= warmup_epoch:
|
119 |
-
for param_group in optim_g.param_groups:
|
120 |
-
param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch
|
121 |
-
for param_group in optim_d.param_groups:
|
122 |
-
param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch
|
123 |
-
# training
|
124 |
-
if rank == 0:
|
125 |
-
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
|
126 |
-
[train_loader, eval_loader], logger, [writer, writer_eval])
|
127 |
-
else:
|
128 |
-
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
|
129 |
-
[train_loader, None], None, None)
|
130 |
-
# update learning rate
|
131 |
-
scheduler_g.step()
|
132 |
-
scheduler_d.step()
|
133 |
-
|
134 |
-
|
135 |
-
def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
|
136 |
-
net_g, net_d = nets
|
137 |
-
optim_g, optim_d = optims
|
138 |
-
scheduler_g, scheduler_d = schedulers
|
139 |
-
train_loader, eval_loader = loaders
|
140 |
-
if writers is not None:
|
141 |
-
writer, writer_eval = writers
|
142 |
-
|
143 |
-
half_type = torch.bfloat16 if hps.train.half_type=="bf16" else torch.float16
|
144 |
-
|
145 |
-
# train_loader.batch_sampler.set_epoch(epoch)
|
146 |
-
global global_step
|
147 |
-
|
148 |
-
net_g.train()
|
149 |
-
net_d.train()
|
150 |
-
for batch_idx, items in enumerate(train_loader):
|
151 |
-
c, f0, spec, y, spk, lengths, uv,volume = items
|
152 |
-
g = spk.cuda(rank, non_blocking=True)
|
153 |
-
spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
|
154 |
-
c = c.cuda(rank, non_blocking=True)
|
155 |
-
f0 = f0.cuda(rank, non_blocking=True)
|
156 |
-
uv = uv.cuda(rank, non_blocking=True)
|
157 |
-
lengths = lengths.cuda(rank, non_blocking=True)
|
158 |
-
mel = spec_to_mel_torch(
|
159 |
-
spec,
|
160 |
-
hps.data.filter_length,
|
161 |
-
hps.data.n_mel_channels,
|
162 |
-
hps.data.sampling_rate,
|
163 |
-
hps.data.mel_fmin,
|
164 |
-
hps.data.mel_fmax)
|
165 |
-
|
166 |
-
with autocast(enabled=hps.train.fp16_run, dtype=half_type):
|
167 |
-
y_hat, ids_slice, z_mask, \
|
168 |
-
(z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
|
169 |
-
spec_lengths=lengths,vol = volume)
|
170 |
-
|
171 |
-
y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
|
172 |
-
y_hat_mel = mel_spectrogram_torch(
|
173 |
-
y_hat.squeeze(1),
|
174 |
-
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
            y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size)  # slice

            # Discriminator
            y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())

            with autocast(enabled=False, dtype=half_type):
                loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
                loss_disc_all = loss_disc

        optim_d.zero_grad()
        scaler.scale(loss_disc_all).backward()
        scaler.unscale_(optim_d)
        grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
        scaler.step(optim_d)


        with autocast(enabled=hps.train.fp16_run, dtype=half_type):
            # Generator
            y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
            with autocast(enabled=False, dtype=half_type):
                loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                loss_fm = feature_loss(fmap_r, fmap_g)
                loss_gen, losses_gen = generator_loss(y_d_hat_g)
                loss_lf0 = F.mse_loss(pred_lf0, lf0) if net_g.module.use_automatic_f0_prediction else 0
                loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0
        optim_g.zero_grad()
        scaler.scale(loss_gen_all).backward()
        scaler.unscale_(optim_g)
        grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
        scaler.step(optim_g)
        scaler.update()

        if rank == 0:
            if global_step % hps.train.log_interval == 0:
                lr = optim_g.param_groups[0]['lr']
                losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
                reference_loss = 0
                for i in losses:
                    reference_loss += i
                logger.info('Train Epoch: {} [{:.0f}%]'.format(
                    epoch,
                    100. * batch_idx / len(train_loader)))
                logger.info(f"Losses: {[x.item() for x in losses]}, step: {global_step}, lr: {lr}, reference_loss: {reference_loss}")

                scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
                               "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
                scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl,
                                    "loss/g/lf0": loss_lf0})

                # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
                # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
                # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
                image_dict = {
                    "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
                    "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
                    "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())
                }

                if net_g.module.use_automatic_f0_prediction:
                    image_dict.update({
                        "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
                                                            pred_lf0[0, 0, :].detach().cpu().numpy()),
                        "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
                                                                 norm_lf0[0, 0, :].detach().cpu().numpy())
                    })

                utils.summarize(
                    writer=writer,
                    global_step=global_step,
                    images=image_dict,
                    scalars=scalar_dict
                )

            if global_step % hps.train.eval_interval == 0:
                evaluate(hps, net_g, eval_loader, writer_eval)
                utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
                                      os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
                utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
                                      os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
                keep_ckpts = getattr(hps.train, 'keep_ckpts', 0)
                if keep_ckpts > 0:
                    utils.clean_checkpoints(path_to_models=hps.model_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True)

        global_step += 1

    if rank == 0:
        global start_time
        now = time.time()
        durtaion = format(now - start_time, '.2f')
        logger.info(f'====> Epoch: {epoch}, cost {durtaion} s')
        start_time = now


def evaluate(hps, generator, eval_loader, writer_eval):
    generator.eval()
    image_dict = {}
    audio_dict = {}
    with torch.no_grad():
        for batch_idx, items in enumerate(eval_loader):
            c, f0, spec, y, spk, _, uv, volume = items
            g = spk[:1].cuda(0)
            spec, y = spec[:1].cuda(0), y[:1].cuda(0)
            c = c[:1].cuda(0)
            f0 = f0[:1].cuda(0)
            uv = uv[:1].cuda(0)
            if volume is not None:
                volume = volume[:1].cuda(0)
            mel = spec_to_mel_torch(
                spec,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.mel_fmin,
                hps.data.mel_fmax)
            y_hat, _ = generator.module.infer(c, f0, uv, g=g, vol=volume)

            y_hat_mel = mel_spectrogram_torch(
                y_hat.squeeze(1).float(),
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )

            audio_dict.update({
                f"gen/audio_{batch_idx}": y_hat[0],
                f"gt/audio_{batch_idx}": y[0]
            })
        image_dict.update({
            "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
            "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
        })
    utils.summarize(
        writer=writer_eval,
        global_step=global_step,
        images=image_dict,
        audios=audio_dict,
        audio_sampling_rate=hps.data.sampling_rate
    )
    generator.train()


if __name__ == "__main__":
    main()
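For readers skimming the deleted training loop above, its generator objective can be restated as the following equation (this only summarizes the code, it adds no new behaviour): the adversarial, feature-matching and log-F0 terms enter with unit weight, the mel and KL terms are scaled by the c_mel and c_kl hyperparameters from hps.train, and the log-F0 term is present only when automatic F0 prediction is enabled.

\mathcal{L}_{G} = \mathcal{L}_{adv} + \mathcal{L}_{fm} + c_{mel}\,\lVert y_{mel} - \hat{y}_{mel} \rVert_{1} + c_{kl}\,\mathcal{L}_{kl} + \mathcal{L}_{lf0}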
train_diff.py
DELETED
@@ -1,77 +0,0 @@
import argparse

import torch
from loguru import logger
from torch.optim import lr_scheduler

from diffusion.data_loaders import get_data_loaders
from diffusion.logger import utils
from diffusion.solver import train
from diffusion.unit2mel import Unit2Mel
from diffusion.vocoder import Vocoder


def parse_args(args=None, namespace=None):
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        required=True,
        help="path to the config file")
    return parser.parse_args(args=args, namespace=namespace)


if __name__ == '__main__':
    # parse commands
    cmd = parse_args()

    # load config
    args = utils.load_config(cmd.config)
    logger.info(' > config:' + cmd.config)
    logger.info(' > exp:' + args.env.expdir)

    # load vocoder
    vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)

    # load model
    model = Unit2Mel(
        args.data.encoder_out_channels,
        args.model.n_spk,
        args.model.use_pitch_aug,
        vocoder.dimension,
        args.model.n_layers,
        args.model.n_chans,
        args.model.n_hidden,
        args.model.timesteps,
        args.model.k_step_max
    )

    logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')

    # load parameters
    optimizer = torch.optim.AdamW(model.parameters())
    initial_global_step, model, optimizer = utils.load_model(args.env.expdir, model, optimizer, device=args.device)
    for param_group in optimizer.param_groups:
        param_group['initial_lr'] = args.train.lr
        param_group['lr'] = args.train.lr * (args.train.gamma ** max(((initial_global_step - 2) // args.train.decay_step), 0))
        param_group['weight_decay'] = args.train.weight_decay
    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.train.decay_step, gamma=args.train.gamma, last_epoch=initial_global_step - 2)

    # device
    if args.device == 'cuda':
        torch.cuda.set_device(args.env.gpu_id)
    model.to(args.device)

    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(args.device)

    # datas
    loader_train, loader_valid = get_data_loaders(args, whole_audio=False)

    # run
    train(args, initial_global_step, model, optimizer, scheduler, vocoder, loader_train, loader_valid)
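Usage note: the deleted diffusion trainer takes a single required flag, -c/--config. A minimal invocation sketch in Python follows; the YAML path is an illustrative placeholder, not a file referenced by this commit.

import subprocess
import sys

# Hypothetical launch of the removed diffusion trainer; "configs/diffusion.yaml"
# is an assumed placeholder path -- point it at your actual diffusion config.
subprocess.run([sys.executable, "train_diff.py", "-c", "configs/diffusion.yaml"], check=True)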
train_index.py
DELETED
@@ -1,30 +0,0 @@
import argparse
import os
import pickle

import utils

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--root_dir", type=str, default="dataset/44k", help="path to root dir"
    )
    parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
                        help='JSON file for configuration')
    parser.add_argument(
        "--output_dir", type=str, default="logs/44k", help="path to output dir"
    )

    args = parser.parse_args()

    hps = utils.get_hparams_from_file(args.config)
    spk_dic = hps.spk
    result = {}

    for k, v in spk_dic.items():
        print(f"now, index {k} feature...")
        index = utils.train_index(k, args.root_dir)
        result[v] = index

    with open(os.path.join(args.output_dir, "feature_and_index.pkl"), "wb") as f:
        pickle.dump(result, f)
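For orientation: the script above pickles a plain dict mapping each entry of the spk table in the config to the retrieval index built by utils.train_index. A minimal sketch of reading that file back is below; the logs/44k path mirrors the script's default --output_dir, and treating the stored values as faiss indexes is an assumption based on how the rest of the project consumes them.

import pickle

# Load the speaker -> retrieval-index mapping written by train_index.py.
with open("logs/44k/feature_and_index.pkl", "rb") as f:
    indexes = pickle.load(f)

for spk_key, index in indexes.items():
    # ntotal is the stored-vector count on a faiss index; fall back if absent.
    print(spk_key, getattr(index, "ntotal", "unknown"))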
trained/put_trained_checkpoints_here
DELETED
File without changes
wav_upload.py
DELETED
@@ -1,25 +0,0 @@
import argparse
import os
import shutil

from google.colab import files

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", type=str, required=True, help="type of file to upload")
    args = parser.parse_args()
    file_type = args.type

    basepath = os.getcwd()
    uploaded = files.upload()  # upload files
    assert(file_type in ['zip', 'audio'])
    if file_type == "zip":
        upload_path = "./upload/"
        for filename in uploaded.keys():
            # move the uploaded file to the designated location
            shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, "userzip.zip"))
    elif file_type == "audio":
        upload_path = "./raw/"
        for filename in uploaded.keys():
            # move the uploaded file to the designated location
            shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, filename))
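Note that this helper only runs inside Google Colab, since it imports google.colab.files. A minimal sketch of driving it from Python is below; the subprocess form is illustrative, and an equivalent notebook shell call behaves the same way.

import subprocess
import sys

# Illustrative only: prompt for audio uploads and move them into ./raw/;
# use --type zip instead to move an uploaded dataset archive to ./upload/userzip.zip.
subprocess.run([sys.executable, "wav_upload.py", "--type", "audio"], check=True)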
webUI.py
DELETED
@@ -1,430 +0,0 @@
import glob
import json
import logging
import os
import re
import subprocess
import sys
import time
import traceback
from itertools import chain
from pathlib import Path

# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile
import torch

from compress_model import removeOptimizer
from edgetts.tts_voices import SUPPORTED_LANGUAGES
from inference.infer_tool import Svc
from utils import mix_model

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('multipart').setLevel(logging.WARNING)

model = None
spk = None
debug = False

local_model_root = './trained'

cuda = {}
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        device_name = torch.cuda.get_device_properties(i).name
        cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"

def upload_mix_append_file(files, sfiles):
    try:
        if sfiles is None:
            file_paths = [file.name for file in files]
        else:
            file_paths = [file.name for file in chain(files, sfiles)]
        p = {file: 100 for file in file_paths}
        return file_paths, mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)

def mix_submit_click(js, mode):
    try:
        assert js.lstrip() != ""
        modes = {"凸组合": 0, "线性组合": 1}
        mode = modes[mode]
        data = json.loads(js)
        data = list(data.items())
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"成功,文件被保存在了{path}"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)

def updata_mix_info(files):
    try:
        if files is None:
            return mix_model_output1.update(value="")
        p = {file.name: 100 for file in files}
        return mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)

def modelAnalysis(model_path, config_path, cluster_model_path, device, enhance, diff_model_path, diff_config_path, only_diffusion, use_spk_mix, local_model_enabled, local_model_selection):
    global model
    try:
        device = cuda[device] if "CUDA" in device else device
        cluster_filepath = os.path.split(cluster_model_path.name) if cluster_model_path is not None else "no_cluster"
        # get model and config path
        if local_model_enabled:
            # local path
            model_path = glob.glob(os.path.join(local_model_selection, '*.pth'))[0]
            config_path = glob.glob(os.path.join(local_model_selection, '*.json'))[0]
        else:
            # upload from webpage
            model_path = model_path.name
            config_path = config_path.name
        fr = ".pkl" in cluster_filepath[1]
        model = Svc(model_path,
                    config_path,
                    device=device if device != "Auto" else None,
                    cluster_model_path=cluster_model_path.name if cluster_model_path is not None else "",
                    nsf_hifigan_enhance=enhance,
                    diffusion_model_path=diff_model_path.name if diff_model_path is not None else "",
                    diffusion_config_path=diff_config_path.name if diff_config_path is not None else "",
                    shallow_diffusion=True if diff_model_path is not None else False,
                    only_diffusion=only_diffusion,
                    spk_mix_enable=use_spk_mix,
                    feature_retrieval=fr
                    )
        spks = list(model.spk2id.keys())
        device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
        msg = f"成功加载模型到设备{device_name}上\n"
        if cluster_model_path is None:
            msg += "未加载聚类模型或特征检索模型\n"
        elif fr:
            msg += f"特征检索模型{cluster_filepath[1]}加载成功\n"
        else:
            msg += f"聚类模型{cluster_filepath[1]}加载成功\n"
        if diff_model_path is None:
            msg += "未加载扩散模型\n"
        else:
            msg += f"扩散模型{diff_model_path.name}加载成功\n"
        msg += "当前模型的可用音色:\n"
        for i in spks:
            msg += i + " "
        return sid.update(choices=spks, value=spks[0]), msg
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)


def modelUnload():
    global model
    if model is None:
        return sid.update(choices=[], value=""), "没有模型需要卸载!"
    else:
        model.unload_model()
        model = None
        torch.cuda.empty_cache()
        return sid.update(choices=[], value=""), "模型卸载完毕!"

def vc_infer(output_format, sid, audio_path, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    global model
    _audio = model.slice_inference(
        audio_path,
        sid,
        vc_transform,
        slice_db,
        cluster_ratio,
        auto_f0,
        noise_scale,
        pad_seconds,
        cl_num,
        lg_num,
        lgr_num,
        f0_predictor,
        enhancer_adaptive_key,
        cr_threshold,
        k_step,
        use_spk_mix,
        second_encoding,
        loudness_envelope_adjustment
    )
    model.clear_empty()
    # build the output file path and save the result into the results folder
    str(int(time.time()))
    if not os.path.exists("results"):
        os.makedirs("results")
    key = "auto" if auto_f0 else f"{int(vc_transform)}key"
    cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_"
    isdiffusion = "sovits"
    if model.shallow_diffusion:
        isdiffusion = "sovdiff"

    if model.only_diffusion:
        isdiffusion = "diff"

    output_file_name = 'result_' + truncated_basename + f'_{sid}_{key}{cluster}{isdiffusion}.{output_format}'
    output_file = os.path.join("results", output_file_name)
    soundfile.write(output_file, _audio, model.target_sample, format=output_format)
    return output_file

def vc_fn(sid, input_audio, output_format, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    global model
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
            if cluster_ratio != 0:
                return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
        #print(input_audio)
        audio, sampling_rate = soundfile.read(input_audio)
        #print(audio.shape,sampling_rate)
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        #print(audio.dtype)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        # for unknown reasons Gradio appends a fixed suffix to the uploaded filepath; strip it here
        truncated_basename = Path(input_audio).stem[:-6]
        processed_audio = os.path.join("raw", f"{truncated_basename}.wav")
        soundfile.write(processed_audio, audio, sampling_rate, format="wav")
        output_file = vc_infer(output_format, sid, processed_audio, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)

        return "Success", output_file
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)

def text_clear(text):
    return re.sub(r"[\n\,\(\) ]", "", text)

def vc_fn2(_text, _lang, _gender, _rate, _volume, sid, output_format, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    global model
    try:
        if model is None:
            return "You need to upload an model", None
        if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
            if cluster_ratio != 0:
                return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
        _rate = f"+{int(_rate*100)}%" if _rate >= 0 else f"{int(_rate*100)}%"
        _volume = f"+{int(_volume*100)}%" if _volume >= 0 else f"{int(_volume*100)}%"
        if _lang == "Auto":
            _gender = "Male" if _gender == "男" else "Female"
            subprocess.run([sys.executable, "edgetts/tts.py", _text, _lang, _rate, _volume, _gender])
        else:
            subprocess.run([sys.executable, "edgetts/tts.py", _text, _lang, _rate, _volume])
        target_sr = 44100
        y, sr = librosa.load("tts.wav")
        resampled_y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        soundfile.write("tts.wav", resampled_y, target_sr, subtype="PCM_16")
        input_audio = "tts.wav"
        #audio, _ = soundfile.read(input_audio)
        output_file_path = vc_infer(output_format, sid, input_audio, "tts", vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        os.remove("tts.wav")
        return "Success", output_file_path
    except Exception as e:
        if debug: traceback.print_exc()  # noqa: E701
        raise gr.Error(e)

def model_compression(_model):
    if _model == "":
        return "请先选择要压缩的模型"
    else:
        model_path = os.path.split(_model.name)
        filename, extension = os.path.splitext(model_path[1])
        output_model_name = f"{filename}_compressed{extension}"
        output_path = os.path.join(os.getcwd(), output_model_name)
        removeOptimizer(_model.name, output_path)
        return f"模型已成功被保存在了{output_path}"

def scan_local_models():
    res = []
    candidates = glob.glob(os.path.join(local_model_root, '**', '*.json'), recursive=True)
    candidates = set([os.path.dirname(c) for c in candidates])
    for candidate in candidates:
        jsons = glob.glob(os.path.join(candidate, '*.json'))
        pths = glob.glob(os.path.join(candidate, '*.pth'))
        if len(jsons) == 1 and len(pths) == 1:
            # must contain exactly one json and one pth file
            res.append(candidate)
    return res

def local_model_refresh_fn():
    choices = scan_local_models()
    return gr.Dropdown.update(choices=choices)

def debug_change():
    global debug
    debug = debug_button.value

with gr.Blocks(
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.green,
        font=["Source Sans Pro", "Arial", "sans-serif"],
        font_mono=['JetBrains mono', "Consolas", 'Courier New']
    ),
) as app:
    with gr.Tabs():
        with gr.TabItem("推理"):
            gr.Markdown(value="""
                So-vits-svc 4.0 推理 webui
                """)
            with gr.Row(variant="panel"):
                with gr.Column():
                    gr.Markdown(value="""
                        <font size=2> 模型设置</font>
                        """)
                    with gr.Tabs():
                        # invisible checkbox that tracks tab status
                        local_model_enabled = gr.Checkbox(value=False, visible=False)
                        with gr.TabItem('上传') as local_model_tab_upload:
                            with gr.Row():
                                model_path = gr.File(label="选择模型文件")
                                config_path = gr.File(label="选择配置文件")
                        with gr.TabItem('本地') as local_model_tab_local:
                            gr.Markdown(f'模型应当放置于{local_model_root}文件夹下')
                            local_model_refresh_btn = gr.Button('刷新本地模型列表')
                            local_model_selection = gr.Dropdown(label='选择模型文件夹', choices=[], interactive=True)
                    with gr.Row():
                        diff_model_path = gr.File(label="选择扩散模型文件")
                        diff_config_path = gr.File(label="选择扩散模型配置文件")
                    cluster_model_path = gr.File(label="选择聚类模型或特征检索文件(没有可以不选)")
                    device = gr.Dropdown(label="推理设备,默认为自动选择CPU和GPU", choices=["Auto", *cuda.keys(), "cpu"], value="Auto")
                    enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False)
                    only_diffusion = gr.Checkbox(label="是否使用全扩散推理,开启后将不使用So-VITS模型,仅使用扩散模型进行完整扩散推理,默认关闭", value=False)
                with gr.Column():
                    gr.Markdown(value="""
                        <font size=3>左侧文件全部选择完毕后(全部文件模块显示download),点击“加载模型”进行解析:</font>
                        """)
                    model_load_button = gr.Button(value="加载模型", variant="primary")
                    model_unload_button = gr.Button(value="卸载模型", variant="primary")
                    sid = gr.Dropdown(label="音色(说话人)")
                    sid_output = gr.Textbox(label="Output Message")


            with gr.Row(variant="panel"):
                with gr.Column():
                    gr.Markdown(value="""
                        <font size=2> 推理设置</font>
                        """)
                    auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声勾选此项会究极跑调)", value=False)
                    f0_predictor = gr.Dropdown(label="选择F0预测器,可选择crepe,pm,dio,harvest,rmvpe,默认为pm(注意:crepe为原F0使用均值滤波器)", choices=["pm", "dio", "harvest", "crepe", "rmvpe"], value="pm")
                    vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
                    cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,0即不启用聚类/特征检索。使用聚类/特征检索能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
                    slice_db = gr.Number(label="切片阈值", value=-40)
                    output_format = gr.Radio(label="音频输出格式", choices=["wav", "flac", "mp3"], value="wav")
                    noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
                    k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum=1, maximum=1000)
                with gr.Column():
                    pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
                    cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒(s)", value=0)
                    lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
                    lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
                    cr_threshold = gr.Number(label="F0过滤阈值,只有启动crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
                    loudness_envelope_adjustment = gr.Number(label="输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络", value=0)
                    second_encoding = gr.Checkbox(label="二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,效果时好时差,默认关闭", value=False)
                    use_spk_mix = gr.Checkbox(label="动态声线融合", value=False, interactive=False)
            with gr.Tabs():
                with gr.TabItem("音频转音频"):
                    vc_input3 = gr.Audio(label="选择音频", type="filepath")
                    vc_submit = gr.Button("音频转换", variant="primary")
                with gr.TabItem("文字转音频"):
                    text2tts = gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪")
                    with gr.Row():
                        tts_gender = gr.Radio(label="说话人性别", choices=["男", "女"], value="男")
                        tts_lang = gr.Dropdown(label="选择语言,Auto为根据输入文字自动识别", choices=SUPPORTED_LANGUAGES, value="Auto")
                        tts_rate = gr.Slider(label="TTS语音变速(倍速相对值)", minimum=-1, maximum=3, value=0, step=0.1)
                        tts_volume = gr.Slider(label="TTS语音音量(相对值)", minimum=-1, maximum=1.5, value=0, step=0.1)
                    vc_submit2 = gr.Button("文字转换", variant="primary")
            with gr.Row():
                with gr.Column():
                    vc_output1 = gr.Textbox(label="Output Message")
                with gr.Column():
                    vc_output2 = gr.Audio(label="Output Audio", interactive=False)

        with gr.TabItem("小工具/实验室特性"):
            gr.Markdown(value="""
                <font size=2> So-vits-svc 4.0 小工具/实验室特性</font>
                """)
            with gr.Tabs():
                with gr.TabItem("静态声线融合"):
                    gr.Markdown(value="""
                        <font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线
                        注意:
                        1.该功能仅支持单说话人的模型
                        2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音
                        3.保证所有待混合模型的config.json中的model字段是相同的
                        4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用
                        5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传
                        6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果
                        7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth
                        8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会
                        </font>
                        """)
                    mix_model_path = gr.Files(label="选择需要混合模型文件")
                    mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple")
                    mix_model_output1 = gr.Textbox(
                        label="混合比例调整,单位/%",
                        interactive=True
                    )
                    mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式", value="凸组合", interactive=True)
                    mix_submit = gr.Button("声线融合启动", variant="primary")
                    mix_model_output2 = gr.Textbox(
                        label="Output Message"
                    )
                    mix_model_path.change(updata_mix_info, [mix_model_path], [mix_model_output1])
                    mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button, mix_model_path], [mix_model_path, mix_model_output1])
                    mix_submit.click(mix_submit_click, [mix_model_output1, mix_mode], [mix_model_output2])

                with gr.TabItem("模型压缩工具"):
                    gr.Markdown(value="""
                        该工具可以实现对模型的体积压缩,在**不影响模型推理功能**的情况下,将原本约600M的So-VITS模型压缩至约200M, 大大减少了硬盘的压力。
                        **注意:压缩后的模型将无法继续训练,请在确认封炉后再压缩。**
                        """)
                    model_to_compress = gr.File(label="模型上传")
                    compress_model_btn = gr.Button("压缩模型", variant="primary")
                    compress_model_output = gr.Textbox(label="输出信息", value="")

                    compress_model_btn.click(model_compression, [model_to_compress], [compress_model_output])


    with gr.Tabs():
        with gr.Row(variant="panel"):
            with gr.Column():
                gr.Markdown(value="""
                    <font size=2> WebUI设置</font>
                    """)
                debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
    # refresh local model list
    local_model_refresh_btn.click(local_model_refresh_fn, outputs=local_model_selection)
    # set local enabled/disabled on tab switch
    local_model_tab_upload.select(lambda: False, outputs=local_model_enabled)
    local_model_tab_local.select(lambda: True, outputs=local_model_enabled)

    vc_submit.click(vc_fn, [sid, vc_input3, output_format, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment], [vc_output1, vc_output2])
    vc_submit2.click(vc_fn2, [text2tts, tts_lang, tts_gender, tts_rate, tts_volume, sid, output_format, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment], [vc_output1, vc_output2])

    debug_button.change(debug_change, [], [])
    model_load_button.click(modelAnalysis, [model_path, config_path, cluster_model_path, device, enhance, diff_model_path, diff_config_path, only_diffusion, use_spk_mix, local_model_enabled, local_model_selection], [sid, sid_output])
    model_unload_button.click(modelUnload, [], [sid, sid_output])
    os.system("start http://127.0.0.1:7860")
    app.launch()
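One behaviour of the deleted web UI worth noting: scan_local_models() only lists a directory under ./trained when it contains exactly one .pth checkpoint and one .json config. The standalone sketch below is not part of this commit; it reproduces that acceptance rule and can help debug why a local model does not appear in the dropdown.

import glob
import os

# Mirror scan_local_models(): find candidate folders by their .json files,
# then keep only those holding exactly one .json and one .pth.
candidates = {os.path.dirname(p) for p in glob.glob(os.path.join("./trained", "**", "*.json"), recursive=True)}
for folder in sorted(candidates):
    jsons = glob.glob(os.path.join(folder, "*.json"))
    pths = glob.glob(os.path.join(folder, "*.pth"))
    print(folder, "listed" if len(jsons) == 1 and len(pths) == 1 else "ignored")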