None1145 committed
Commit b21e8bb · 1 Parent(s): 20843bd
app.py CHANGED
@@ -21,7 +21,8 @@ def list_files_tree(directory, indent=""):
 
 from huggingface_hub import snapshot_download
 print("Models...")
-models_id = """None1145/So-VITS-SVC-Vulpisfoglia"""
+models_id = """None1145/So-VITS-SVC-Lappland
+None1145/So-VITS-SVC-Lappland-the-Decadenza"""
 for model_id in models_id.split("\n"):
     if model_id in ["", " "]:
         break
@@ -95,12 +96,12 @@ def vc_fn(input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scal
 app = gr.Blocks()
 with app:
     with gr.Tabs():
-        with gr.TabItem("Model"):
+        with gr.Tabs():
             speaker = gr.Dropdown(label="讲话人", choices=speakers, value=speakers[0])
             model_submit = gr.Button("加载模型", variant="primary")
             model_output1 = gr.Textbox(label="Output Message")
-            model_submit.click(load, [speaker], [model_output1])
-        with gr.TabItem("Basic"):
+            model_submit.click(load, [speaker], [model_output1])
+        with gr.Tabs():
             # sid = gr.Dropdown(label="音色", choices=speakers, value=speakers[0])
             vc_input3 = gr.Audio(label="上传音频")
             vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
@@ -111,5 +112,5 @@ with app:
             vc_submit = gr.Button("转换", variant="primary")
             vc_output1 = gr.Textbox(label="Output Message")
             vc_output2 = gr.Audio(label="Output Audio")
-            vc_submit.click(vc_fn, [vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2])
+            vc_submit.click(vc_fn, [vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2])
 app.launch()
export_index_for_onnx.py DELETED
@@ -1,20 +0,0 @@
-import os
-import pickle
-
-import faiss
-
-path = "crs"
-indexs_file_path = f"checkpoints/{path}/feature_and_index.pkl"
-indexs_out_dir = f"checkpoints/{path}/"
-
-with open("feature_and_index.pkl",mode="rb") as f:
-    indexs = pickle.load(f)
-
-for k in indexs:
-    print(f"Save {k} index")
-    faiss.write_index(
-        indexs[k],
-        os.path.join(indexs_out_dir,f"Index-{k}.index")
-    )
-
-print("Saved all index")
flask_api.py DELETED
@@ -1,60 +0,0 @@
-import io
-import logging
-
-import soundfile
-import torch
-import torchaudio
-from flask import Flask, request, send_file
-from flask_cors import CORS
-
-from inference.infer_tool import RealTimeVC, Svc
-
-app = Flask(__name__)
-
-CORS(app)
-
-logging.getLogger('numba').setLevel(logging.WARNING)
-
-
-@app.route("/voiceChangeModel", methods=["POST"])
-def voice_change_model():
-    request_form = request.form
-    wave_file = request.files.get("sample", None)
-    # Pitch-shift amount
-    f_pitch_change = float(request_form.get("fPitchChange", 0))
-    # Sample rate required by the DAW
-    daw_sample = int(float(request_form.get("sampleRate", 0)))
-    speaker_id = int(float(request_form.get("sSpeakId", 0)))
-    # Get the wav file over HTTP and convert it
-    input_wav_path = io.BytesIO(wave_file.read())
-
-    # Model inference
-    if raw_infer:
-        # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
-        out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
-                                            auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
-        tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
-    else:
-        out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
-                                auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
-        tar_audio = torchaudio.functional.resample(torch.from_numpy(out_audio), svc_model.target_sample, daw_sample)
-    # Return the audio
-    out_wav_path = io.BytesIO()
-    soundfile.write(out_wav_path, tar_audio.cpu().numpy(), daw_sample, format="wav")
-    out_wav_path.seek(0)
-    return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
-
-
-if __name__ == '__main__':
-    # When enabled, slices are synthesized directly; False uses cross-fading instead
-    # Setting the VST plugin slice time to 0.3-0.5 s reduces latency; direct slicing can pop at the joins, cross-fading slightly overlaps the audio
-    # Choose whichever method is acceptable, or raise the VST max slice time to 1 s; set to True here for higher latency but more stable quality
-    raw_infer = True
-    # Each model corresponds to exactly one config
-    model_name = "logs/32k/G_174000-Copy1.pth"
-    config_name = "configs/config.json"
-    cluster_model_path = "logs/44k/kmeans_10000.pt"
-    svc_model = Svc(model_name, config_name, cluster_model_path=cluster_model_path)
-    svc = RealTimeVC()
-    # This matches the VST plugin; changing it is not recommended
-    app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
flask_api_full_song.py DELETED
@@ -1,55 +0,0 @@
-import io
-
-import numpy as np
-import soundfile
-from flask import Flask, request, send_file
-
-from inference import infer_tool, slicer
-
-app = Flask(__name__)
-
-
-@app.route("/wav2wav", methods=["POST"])
-def wav2wav():
-    request_form = request.form
-    audio_path = request_form.get("audio_path", None)  # path to the wav file
-    tran = int(float(request_form.get("tran", 0)))  # pitch shift
-    spk = request_form.get("spk", 0)  # speaker (id or name, depending on your config)
-    wav_format = request_form.get("wav_format", 'wav')  # output file format
-    infer_tool.format_wav(audio_path)
-    chunks = slicer.cut(audio_path, db_thresh=-40)
-    audio_data, audio_sr = slicer.chunks2audio(audio_path, chunks)
-
-    audio = []
-    for (slice_tag, data) in audio_data:
-        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-
-        length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
-        if slice_tag:
-            print('jump empty segment')
-            _audio = np.zeros(length)
-        else:
-            # pad
-            pad_len = int(audio_sr * 0.5)
-            data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
-            raw_path = io.BytesIO()
-            soundfile.write(raw_path, data, audio_sr, format="wav")
-            raw_path.seek(0)
-            out_audio, out_sr = svc_model.infer(spk, tran, raw_path)
-            svc_model.clear_empty()
-            _audio = out_audio.cpu().numpy()
-            pad_len = int(svc_model.target_sample * 0.5)
-            _audio = _audio[pad_len:-pad_len]
-
-        audio.extend(list(infer_tool.pad_array(_audio, length)))
-    out_wav_path = io.BytesIO()
-    soundfile.write(out_wav_path, audio, svc_model.target_sample, format=wav_format)
-    out_wav_path.seek(0)
-    return send_file(out_wav_path, download_name=f"temp.{wav_format}", as_attachment=True)
-
-
-if __name__ == '__main__':
-    model_name = "logs/44k/G_60000.pth"  # model path
-    config_name = "configs/config.json"  # config path
-    svc_model = infer_tool.Svc(model_name, config_name)
-    app.run(port=1145, host="0.0.0.0", debug=False, threaded=False)
logs/44k/1 DELETED
File without changes
onnx_export.py DELETED
@@ -1,144 +0,0 @@
1
- import argparse
2
- import json
3
-
4
- import torch
5
-
6
- import utils
7
- from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
8
-
9
- parser = argparse.ArgumentParser(description='SoVitsSvc OnnxExport')
10
-
11
- def OnnxExport(path=None):
12
- device = torch.device("cpu")
13
- hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
14
- SVCVITS = SynthesizerTrn(
15
- hps.data.filter_length // 2 + 1,
16
- hps.train.segment_size // hps.data.hop_length,
17
- **hps.model)
18
- _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
19
- _ = SVCVITS.eval().to(device)
20
- for i in SVCVITS.parameters():
21
- i.requires_grad = False
22
-
23
- num_frames = 200
24
-
25
- test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
26
- test_pitch = torch.rand(1, num_frames)
27
- test_vol = torch.rand(1, num_frames)
28
- test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
29
- test_uv = torch.ones(1, num_frames, dtype=torch.float32)
30
- test_noise = torch.randn(1, 192, num_frames)
31
- test_sid = torch.LongTensor([0])
32
- export_mix = True
33
- if len(hps.spk) < 2:
34
- export_mix = False
35
-
36
- if export_mix:
37
- spk_mix = []
38
- n_spk = len(hps.spk)
39
- for i in range(n_spk):
40
- spk_mix.append(1.0/float(n_spk))
41
- test_sid = torch.tensor(spk_mix)
42
- SVCVITS.export_chara_mix(hps.spk)
43
- test_sid = test_sid.unsqueeze(0)
44
- test_sid = test_sid.repeat(num_frames, 1)
45
-
46
- SVCVITS.eval()
47
-
48
- if export_mix:
49
- daxes = {
50
- "c": [0, 1],
51
- "f0": [1],
52
- "mel2ph": [1],
53
- "uv": [1],
54
- "noise": [2],
55
- "sid":[0]
56
- }
57
- else:
58
- daxes = {
59
- "c": [0, 1],
60
- "f0": [1],
61
- "mel2ph": [1],
62
- "uv": [1],
63
- "noise": [2]
64
- }
65
-
66
- input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
67
- output_names = ["audio", ]
68
-
69
- if SVCVITS.vol_embedding:
70
- input_names.append("vol")
71
- vol_dadict = {"vol" : [1]}
72
- daxes.update(vol_dadict)
73
- test_inputs = (
74
- test_hidden_unit.to(device),
75
- test_pitch.to(device),
76
- test_mel2ph.to(device),
77
- test_uv.to(device),
78
- test_noise.to(device),
79
- test_sid.to(device),
80
- test_vol.to(device)
81
- )
82
- else:
83
- test_inputs = (
84
- test_hidden_unit.to(device),
85
- test_pitch.to(device),
86
- test_mel2ph.to(device),
87
- test_uv.to(device),
88
- test_noise.to(device),
89
- test_sid.to(device)
90
- )
91
-
92
- # SVCVITS = torch.jit.script(SVCVITS)
93
- SVCVITS(test_hidden_unit.to(device),
94
- test_pitch.to(device),
95
- test_mel2ph.to(device),
96
- test_uv.to(device),
97
- test_noise.to(device),
98
- test_sid.to(device),
99
- test_vol.to(device))
100
-
101
- SVCVITS.dec.OnnxExport()
102
-
103
- torch.onnx.export(
104
- SVCVITS,
105
- test_inputs,
106
- f"checkpoints/{path}/{path}_SoVits.onnx",
107
- dynamic_axes=daxes,
108
- do_constant_folding=False,
109
- opset_version=16,
110
- verbose=False,
111
- input_names=input_names,
112
- output_names=output_names
113
- )
114
-
115
- vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
116
- spklist = []
117
- for key in hps.spk.keys():
118
- spklist.append(key)
119
-
120
- MoeVSConf = {
121
- "Folder" : f"{path}",
122
- "Name" : f"{path}",
123
- "Type" : "SoVits",
124
- "Rate" : hps.data.sampling_rate,
125
- "Hop" : hps.data.hop_length,
126
- "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
127
- "SoVits4": True,
128
- "SoVits3": False,
129
- "CharaMix": export_mix,
130
- "Volume": SVCVITS.vol_embedding,
131
- "HiddenSize": SVCVITS.gin_channels,
132
- "Characters": spklist,
133
- "Cluster": ""
134
- }
135
-
136
- with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
137
- json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
138
-
139
-
140
- if __name__ == '__main__':
141
- parser.add_argument('-n', '--model_name', type=str, default="TransformerFlow", help='模型文件夹名(根目录下新建ckeckpoints文件夹,在此文件夹下建立一个新的文件夹,放置模型,该文件夹名即为此项)')
142
- args = parser.parse_args()
143
- path = args.model_name
144
- OnnxExport(path)
onnx_export_old.py DELETED
@@ -1,56 +0,0 @@
-import torch
-
-import utils
-from onnxexport.model_onnx import SynthesizerTrn
-
-
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
-
-
-if __name__ == '__main__':
-    main(True)
onnxexport/model_onnx.py DELETED
@@ -1,333 +0,0 @@
1
- import torch
2
- from torch import nn
3
- from torch.nn import Conv1d, Conv2d
4
- from torch.nn import functional as F
5
- from torch.nn.utils import spectral_norm, weight_norm
6
-
7
- import modules.attentions as attentions
8
- import modules.commons as commons
9
- import modules.modules as modules
10
- import utils
11
- from modules.commons import get_padding
12
- from utils import f0_to_coarse
13
- from vdecoder.hifigan.models import Generator
14
-
15
-
16
- class ResidualCouplingBlock(nn.Module):
17
- def __init__(self,
18
- channels,
19
- hidden_channels,
20
- kernel_size,
21
- dilation_rate,
22
- n_layers,
23
- n_flows=4,
24
- gin_channels=0):
25
- super().__init__()
26
- self.channels = channels
27
- self.hidden_channels = hidden_channels
28
- self.kernel_size = kernel_size
29
- self.dilation_rate = dilation_rate
30
- self.n_layers = n_layers
31
- self.n_flows = n_flows
32
- self.gin_channels = gin_channels
33
-
34
- self.flows = nn.ModuleList()
35
- for i in range(n_flows):
36
- self.flows.append(
37
- modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
38
- gin_channels=gin_channels, mean_only=True))
39
- self.flows.append(modules.Flip())
40
-
41
- def forward(self, x, x_mask, g=None, reverse=False):
42
- if not reverse:
43
- for flow in self.flows:
44
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
45
- else:
46
- for flow in reversed(self.flows):
47
- x = flow(x, x_mask, g=g, reverse=reverse)
48
- return x
49
-
50
-
51
- class Encoder(nn.Module):
52
- def __init__(self,
53
- in_channels,
54
- out_channels,
55
- hidden_channels,
56
- kernel_size,
57
- dilation_rate,
58
- n_layers,
59
- gin_channels=0):
60
- super().__init__()
61
- self.in_channels = in_channels
62
- self.out_channels = out_channels
63
- self.hidden_channels = hidden_channels
64
- self.kernel_size = kernel_size
65
- self.dilation_rate = dilation_rate
66
- self.n_layers = n_layers
67
- self.gin_channels = gin_channels
68
-
69
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
70
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
71
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
72
-
73
- def forward(self, x, x_lengths, g=None):
74
- # print(x.shape,x_lengths.shape)
75
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
76
- x = self.pre(x) * x_mask
77
- x = self.enc(x, x_mask, g=g)
78
- stats = self.proj(x) * x_mask
79
- m, logs = torch.split(stats, self.out_channels, dim=1)
80
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
81
- return z, m, logs, x_mask
82
-
83
-
84
- class TextEncoder(nn.Module):
85
- def __init__(self,
86
- out_channels,
87
- hidden_channels,
88
- kernel_size,
89
- n_layers,
90
- gin_channels=0,
91
- filter_channels=None,
92
- n_heads=None,
93
- p_dropout=None):
94
- super().__init__()
95
- self.out_channels = out_channels
96
- self.hidden_channels = hidden_channels
97
- self.kernel_size = kernel_size
98
- self.n_layers = n_layers
99
- self.gin_channels = gin_channels
100
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
101
- self.f0_emb = nn.Embedding(256, hidden_channels)
102
-
103
- self.enc_ = attentions.Encoder(
104
- hidden_channels,
105
- filter_channels,
106
- n_heads,
107
- n_layers,
108
- kernel_size,
109
- p_dropout)
110
-
111
- def forward(self, x, x_mask, f0=None, z=None):
112
- x = x + self.f0_emb(f0).transpose(1, 2)
113
- x = self.enc_(x * x_mask, x_mask)
114
- stats = self.proj(x) * x_mask
115
- m, logs = torch.split(stats, self.out_channels, dim=1)
116
- z = (m + z * torch.exp(logs)) * x_mask
117
- return z, m, logs, x_mask
118
-
119
-
120
- class DiscriminatorP(torch.nn.Module):
121
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
122
- super(DiscriminatorP, self).__init__()
123
- self.period = period
124
- self.use_spectral_norm = use_spectral_norm
125
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
126
- self.convs = nn.ModuleList([
127
- norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
128
- norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
129
- norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
130
- norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
131
- norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
132
- ])
133
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
134
-
135
- def forward(self, x):
136
- fmap = []
137
-
138
- # 1d to 2d
139
- b, c, t = x.shape
140
- if t % self.period != 0: # pad first
141
- n_pad = self.period - (t % self.period)
142
- x = F.pad(x, (0, n_pad), "reflect")
143
- t = t + n_pad
144
- x = x.view(b, c, t // self.period, self.period)
145
-
146
- for l in self.convs:
147
- x = l(x)
148
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
149
- fmap.append(x)
150
- x = self.conv_post(x)
151
- fmap.append(x)
152
- x = torch.flatten(x, 1, -1)
153
-
154
- return x, fmap
155
-
156
-
157
- class DiscriminatorS(torch.nn.Module):
158
- def __init__(self, use_spectral_norm=False):
159
- super(DiscriminatorS, self).__init__()
160
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
161
- self.convs = nn.ModuleList([
162
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
163
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
164
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
165
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
166
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
167
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
168
- ])
169
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
170
-
171
- def forward(self, x):
172
- fmap = []
173
-
174
- for l in self.convs:
175
- x = l(x)
176
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
177
- fmap.append(x)
178
- x = self.conv_post(x)
179
- fmap.append(x)
180
- x = torch.flatten(x, 1, -1)
181
-
182
- return x, fmap
183
-
184
-
185
- class F0Decoder(nn.Module):
186
- def __init__(self,
187
- out_channels,
188
- hidden_channels,
189
- filter_channels,
190
- n_heads,
191
- n_layers,
192
- kernel_size,
193
- p_dropout,
194
- spk_channels=0):
195
- super().__init__()
196
- self.out_channels = out_channels
197
- self.hidden_channels = hidden_channels
198
- self.filter_channels = filter_channels
199
- self.n_heads = n_heads
200
- self.n_layers = n_layers
201
- self.kernel_size = kernel_size
202
- self.p_dropout = p_dropout
203
- self.spk_channels = spk_channels
204
-
205
- self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
206
- self.decoder = attentions.FFT(
207
- hidden_channels,
208
- filter_channels,
209
- n_heads,
210
- n_layers,
211
- kernel_size,
212
- p_dropout)
213
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
214
- self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
215
- self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
216
-
217
- def forward(self, x, norm_f0, x_mask, spk_emb=None):
218
- x = torch.detach(x)
219
- if spk_emb is not None:
220
- x = x + self.cond(spk_emb)
221
- x += self.f0_prenet(norm_f0)
222
- x = self.prenet(x) * x_mask
223
- x = self.decoder(x * x_mask, x_mask)
224
- x = self.proj(x) * x_mask
225
- return x
226
-
227
-
228
- class SynthesizerTrn(nn.Module):
229
- """
230
- Synthesizer for Training
231
- """
232
-
233
- def __init__(self,
234
- spec_channels,
235
- segment_size,
236
- inter_channels,
237
- hidden_channels,
238
- filter_channels,
239
- n_heads,
240
- n_layers,
241
- kernel_size,
242
- p_dropout,
243
- resblock,
244
- resblock_kernel_sizes,
245
- resblock_dilation_sizes,
246
- upsample_rates,
247
- upsample_initial_channel,
248
- upsample_kernel_sizes,
249
- gin_channels,
250
- ssl_dim,
251
- n_speakers,
252
- sampling_rate=44100,
253
- **kwargs):
254
- super().__init__()
255
- self.spec_channels = spec_channels
256
- self.inter_channels = inter_channels
257
- self.hidden_channels = hidden_channels
258
- self.filter_channels = filter_channels
259
- self.n_heads = n_heads
260
- self.n_layers = n_layers
261
- self.kernel_size = kernel_size
262
- self.p_dropout = p_dropout
263
- self.resblock = resblock
264
- self.resblock_kernel_sizes = resblock_kernel_sizes
265
- self.resblock_dilation_sizes = resblock_dilation_sizes
266
- self.upsample_rates = upsample_rates
267
- self.upsample_initial_channel = upsample_initial_channel
268
- self.upsample_kernel_sizes = upsample_kernel_sizes
269
- self.segment_size = segment_size
270
- self.gin_channels = gin_channels
271
- self.ssl_dim = ssl_dim
272
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
273
-
274
- self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
275
-
276
- self.enc_p = TextEncoder(
277
- inter_channels,
278
- hidden_channels,
279
- filter_channels=filter_channels,
280
- n_heads=n_heads,
281
- n_layers=n_layers,
282
- kernel_size=kernel_size,
283
- p_dropout=p_dropout
284
- )
285
- hps = {
286
- "sampling_rate": sampling_rate,
287
- "inter_channels": inter_channels,
288
- "resblock": resblock,
289
- "resblock_kernel_sizes": resblock_kernel_sizes,
290
- "resblock_dilation_sizes": resblock_dilation_sizes,
291
- "upsample_rates": upsample_rates,
292
- "upsample_initial_channel": upsample_initial_channel,
293
- "upsample_kernel_sizes": upsample_kernel_sizes,
294
- "gin_channels": gin_channels,
295
- }
296
- self.dec = Generator(h=hps)
297
- self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
298
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
299
- self.f0_decoder = F0Decoder(
300
- 1,
301
- hidden_channels,
302
- filter_channels,
303
- n_heads,
304
- n_layers,
305
- kernel_size,
306
- p_dropout,
307
- spk_channels=gin_channels
308
- )
309
- self.emb_uv = nn.Embedding(2, hidden_channels)
310
- self.predict_f0 = False
311
-
312
- def forward(self, c, f0, mel2ph, uv, noise=None, g=None):
313
-
314
- decoder_inp = F.pad(c, [0, 0, 1, 0])
315
- mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
316
- c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2) # [B, T, H]
317
-
318
- c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
319
- g = g.unsqueeze(0)
320
- g = self.emb_g(g).transpose(1, 2)
321
- x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
322
- x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
323
-
324
- if self.predict_f0:
325
- lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
326
- norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
327
- pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
328
- f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
329
-
330
- z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
331
- z = self.flow(z_p, c_mask, g=g, reverse=True)
332
- o = self.dec(z * c_mask, g=g, f0=f0)
333
- return o
onnxexport/model_onnx_speaker_mix.py DELETED
@@ -1,366 +0,0 @@
1
- import torch
2
- from torch import nn
3
- from torch.nn import functional as F
4
-
5
- import modules.attentions as attentions
6
- import modules.commons as commons
7
- import modules.modules as modules
8
- import utils
9
- from utils import f0_to_coarse
10
-
11
-
12
- class ResidualCouplingBlock(nn.Module):
13
- def __init__(self,
14
- channels,
15
- hidden_channels,
16
- kernel_size,
17
- dilation_rate,
18
- n_layers,
19
- n_flows=4,
20
- gin_channels=0,
21
- share_parameter=False
22
- ):
23
- super().__init__()
24
- self.channels = channels
25
- self.hidden_channels = hidden_channels
26
- self.kernel_size = kernel_size
27
- self.dilation_rate = dilation_rate
28
- self.n_layers = n_layers
29
- self.n_flows = n_flows
30
- self.gin_channels = gin_channels
31
-
32
- self.flows = nn.ModuleList()
33
-
34
- self.wn = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=gin_channels) if share_parameter else None
35
-
36
- for i in range(n_flows):
37
- self.flows.append(
38
- modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
39
- gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
40
- self.flows.append(modules.Flip())
41
-
42
- def forward(self, x, x_mask, g=None, reverse=False):
43
- if not reverse:
44
- for flow in self.flows:
45
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
46
- else:
47
- for flow in reversed(self.flows):
48
- x = flow(x, x_mask, g=g, reverse=reverse)
49
- return x
50
-
51
- class TransformerCouplingBlock(nn.Module):
52
- def __init__(self,
53
- channels,
54
- hidden_channels,
55
- filter_channels,
56
- n_heads,
57
- n_layers,
58
- kernel_size,
59
- p_dropout,
60
- n_flows=4,
61
- gin_channels=0,
62
- share_parameter=False
63
- ):
64
-
65
- super().__init__()
66
- self.channels = channels
67
- self.hidden_channels = hidden_channels
68
- self.kernel_size = kernel_size
69
- self.n_layers = n_layers
70
- self.n_flows = n_flows
71
- self.gin_channels = gin_channels
72
-
73
- self.flows = nn.ModuleList()
74
-
75
- self.wn = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow = True, gin_channels = self.gin_channels) if share_parameter else None
76
-
77
- for i in range(n_flows):
78
- self.flows.append(
79
- modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers, n_heads, p_dropout, filter_channels, mean_only=True, wn_sharing_parameter=self.wn, gin_channels = self.gin_channels))
80
- self.flows.append(modules.Flip())
81
-
82
- def forward(self, x, x_mask, g=None, reverse=False):
83
- if not reverse:
84
- for flow in self.flows:
85
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
86
- else:
87
- for flow in reversed(self.flows):
88
- x = flow(x, x_mask, g=g, reverse=reverse)
89
- return x
90
-
91
-
92
- class Encoder(nn.Module):
93
- def __init__(self,
94
- in_channels,
95
- out_channels,
96
- hidden_channels,
97
- kernel_size,
98
- dilation_rate,
99
- n_layers,
100
- gin_channels=0):
101
- super().__init__()
102
- self.in_channels = in_channels
103
- self.out_channels = out_channels
104
- self.hidden_channels = hidden_channels
105
- self.kernel_size = kernel_size
106
- self.dilation_rate = dilation_rate
107
- self.n_layers = n_layers
108
- self.gin_channels = gin_channels
109
-
110
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
111
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
112
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
113
-
114
- def forward(self, x, x_lengths, g=None):
115
- # print(x.shape,x_lengths.shape)
116
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
117
- x = self.pre(x) * x_mask
118
- x = self.enc(x, x_mask, g=g)
119
- stats = self.proj(x) * x_mask
120
- m, logs = torch.split(stats, self.out_channels, dim=1)
121
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
122
- return z, m, logs, x_mask
123
-
124
-
125
- class TextEncoder(nn.Module):
126
- def __init__(self,
127
- out_channels,
128
- hidden_channels,
129
- kernel_size,
130
- n_layers,
131
- gin_channels=0,
132
- filter_channels=None,
133
- n_heads=None,
134
- p_dropout=None):
135
- super().__init__()
136
- self.out_channels = out_channels
137
- self.hidden_channels = hidden_channels
138
- self.kernel_size = kernel_size
139
- self.n_layers = n_layers
140
- self.gin_channels = gin_channels
141
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
142
- self.f0_emb = nn.Embedding(256, hidden_channels)
143
-
144
- self.enc_ = attentions.Encoder(
145
- hidden_channels,
146
- filter_channels,
147
- n_heads,
148
- n_layers,
149
- kernel_size,
150
- p_dropout)
151
-
152
- def forward(self, x, x_mask, f0=None, z=None):
153
- x = x + self.f0_emb(f0).transpose(1, 2)
154
- x = self.enc_(x * x_mask, x_mask)
155
- stats = self.proj(x) * x_mask
156
- m, logs = torch.split(stats, self.out_channels, dim=1)
157
- z = (m + z * torch.exp(logs)) * x_mask
158
-
159
- return z, m, logs, x_mask
160
-
161
-
162
- class F0Decoder(nn.Module):
163
- def __init__(self,
164
- out_channels,
165
- hidden_channels,
166
- filter_channels,
167
- n_heads,
168
- n_layers,
169
- kernel_size,
170
- p_dropout,
171
- spk_channels=0):
172
- super().__init__()
173
- self.out_channels = out_channels
174
- self.hidden_channels = hidden_channels
175
- self.filter_channels = filter_channels
176
- self.n_heads = n_heads
177
- self.n_layers = n_layers
178
- self.kernel_size = kernel_size
179
- self.p_dropout = p_dropout
180
- self.spk_channels = spk_channels
181
-
182
- self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
183
- self.decoder = attentions.FFT(
184
- hidden_channels,
185
- filter_channels,
186
- n_heads,
187
- n_layers,
188
- kernel_size,
189
- p_dropout)
190
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
191
- self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
192
- self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
193
-
194
- def forward(self, x, norm_f0, x_mask, spk_emb=None):
195
- x = torch.detach(x)
196
- if (spk_emb is not None):
197
- x = x + self.cond(spk_emb)
198
- x += self.f0_prenet(norm_f0)
199
- x = self.prenet(x) * x_mask
200
- x = self.decoder(x * x_mask, x_mask)
201
- x = self.proj(x) * x_mask
202
- return x
203
-
204
-
205
- class SynthesizerTrn(nn.Module):
206
- """
207
- Synthesizer for Training
208
- """
209
-
210
- def __init__(self,
211
- spec_channels,
212
- segment_size,
213
- inter_channels,
214
- hidden_channels,
215
- filter_channels,
216
- n_heads,
217
- n_layers,
218
- kernel_size,
219
- p_dropout,
220
- resblock,
221
- resblock_kernel_sizes,
222
- resblock_dilation_sizes,
223
- upsample_rates,
224
- upsample_initial_channel,
225
- upsample_kernel_sizes,
226
- gin_channels,
227
- ssl_dim,
228
- n_speakers,
229
- sampling_rate=44100,
230
- vol_embedding=False,
231
- vocoder_name = "nsf-hifigan",
232
- use_depthwise_conv = False,
233
- use_automatic_f0_prediction = True,
234
- flow_share_parameter = False,
235
- n_flow_layer = 4,
236
- n_layers_trans_flow = 3,
237
- use_transformer_flow = False,
238
- **kwargs):
239
-
240
- super().__init__()
241
- self.spec_channels = spec_channels
242
- self.inter_channels = inter_channels
243
- self.hidden_channels = hidden_channels
244
- self.filter_channels = filter_channels
245
- self.n_heads = n_heads
246
- self.n_layers = n_layers
247
- self.kernel_size = kernel_size
248
- self.p_dropout = p_dropout
249
- self.resblock = resblock
250
- self.resblock_kernel_sizes = resblock_kernel_sizes
251
- self.resblock_dilation_sizes = resblock_dilation_sizes
252
- self.upsample_rates = upsample_rates
253
- self.upsample_initial_channel = upsample_initial_channel
254
- self.upsample_kernel_sizes = upsample_kernel_sizes
255
- self.segment_size = segment_size
256
- self.gin_channels = gin_channels
257
- self.ssl_dim = ssl_dim
258
- self.vol_embedding = vol_embedding
259
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
260
- self.use_depthwise_conv = use_depthwise_conv
261
- self.use_automatic_f0_prediction = use_automatic_f0_prediction
262
- self.n_layers_trans_flow = n_layers_trans_flow
263
- if vol_embedding:
264
- self.emb_vol = nn.Linear(1, hidden_channels)
265
-
266
- self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
267
-
268
- self.enc_p = TextEncoder(
269
- inter_channels,
270
- hidden_channels,
271
- filter_channels=filter_channels,
272
- n_heads=n_heads,
273
- n_layers=n_layers,
274
- kernel_size=kernel_size,
275
- p_dropout=p_dropout
276
- )
277
- hps = {
278
- "sampling_rate": sampling_rate,
279
- "inter_channels": inter_channels,
280
- "resblock": resblock,
281
- "resblock_kernel_sizes": resblock_kernel_sizes,
282
- "resblock_dilation_sizes": resblock_dilation_sizes,
283
- "upsample_rates": upsample_rates,
284
- "upsample_initial_channel": upsample_initial_channel,
285
- "upsample_kernel_sizes": upsample_kernel_sizes,
286
- "gin_channels": gin_channels,
287
- "use_depthwise_conv":use_depthwise_conv
288
- }
289
-
290
- modules.set_Conv1dModel(self.use_depthwise_conv)
291
-
292
- if vocoder_name == "nsf-hifigan":
293
- from vdecoder.hifigan.models import Generator
294
- self.dec = Generator(h=hps)
295
- elif vocoder_name == "nsf-snake-hifigan":
296
- from vdecoder.hifiganwithsnake.models import Generator
297
- self.dec = Generator(h=hps)
298
- else:
299
- print("[?] Unkown vocoder: use default(nsf-hifigan)")
300
- from vdecoder.hifigan.models import Generator
301
- self.dec = Generator(h=hps)
302
-
303
- self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
304
- if use_transformer_flow:
305
- self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels, n_heads, n_layers_trans_flow, 5, p_dropout, n_flow_layer, gin_channels=gin_channels, share_parameter=flow_share_parameter)
306
- else:
307
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels, share_parameter=flow_share_parameter)
308
- if self.use_automatic_f0_prediction:
309
- self.f0_decoder = F0Decoder(
310
- 1,
311
- hidden_channels,
312
- filter_channels,
313
- n_heads,
314
- n_layers,
315
- kernel_size,
316
- p_dropout,
317
- spk_channels=gin_channels
318
- )
319
- self.emb_uv = nn.Embedding(2, hidden_channels)
320
- self.predict_f0 = False
321
- self.speaker_map = []
322
- self.export_mix = False
323
-
324
- def export_chara_mix(self, speakers_mix):
325
- self.speaker_map = torch.zeros((len(speakers_mix), 1, 1, self.gin_channels))
326
- i = 0
327
- for key in speakers_mix.keys():
328
- spkidx = speakers_mix[key]
329
- self.speaker_map[i] = self.emb_g(torch.LongTensor([[spkidx]]))
330
- i = i + 1
331
- self.speaker_map = self.speaker_map.unsqueeze(0)
332
- self.export_mix = True
333
-
334
- def forward(self, c, f0, mel2ph, uv, noise=None, g=None, vol = None):
335
- decoder_inp = F.pad(c, [0, 0, 1, 0])
336
- mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
337
- c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2) # [B, T, H]
338
-
339
- if self.export_mix: # [N, S] * [S, B, 1, H]
340
- g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
341
- g = g * self.speaker_map # [N, S, B, 1, H]
342
- g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
343
- g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
344
- else:
345
- if g.dim() == 1:
346
- g = g.unsqueeze(0)
347
- g = self.emb_g(g).transpose(1, 2)
348
-
349
- x_mask = torch.unsqueeze(torch.ones_like(f0), 1).to(c.dtype)
350
- # vol proj
351
-
352
- vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
353
-
354
- x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
355
-
356
- if self.use_automatic_f0_prediction and self.predict_f0:
357
- lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
358
- norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
359
- pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
360
- f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
361
-
362
- z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
363
- z = self.flow(z_p, c_mask, g=g, reverse=True)
364
- o = self.dec(z * c_mask, g=g, f0=f0)
365
- return o
366
-
train.py DELETED
@@ -1,329 +0,0 @@
1
- import logging
2
- import multiprocessing
3
- import os
4
- import time
5
-
6
- import torch
7
- import torch.distributed as dist
8
- import torch.multiprocessing as mp
9
- from torch.cuda.amp import GradScaler, autocast
10
- from torch.nn import functional as F
11
- from torch.nn.parallel import DistributedDataParallel as DDP
12
- from torch.utils.data import DataLoader
13
- from torch.utils.tensorboard import SummaryWriter
14
-
15
- import modules.commons as commons
16
- import utils
17
- from data_utils import TextAudioCollate, TextAudioSpeakerLoader
18
- from models import (
19
- MultiPeriodDiscriminator,
20
- SynthesizerTrn,
21
- )
22
- from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
23
- from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
24
-
25
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
26
- logging.getLogger('numba').setLevel(logging.WARNING)
27
-
28
- torch.backends.cudnn.benchmark = True
29
- global_step = 0
30
- start_time = time.time()
31
-
32
- # os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
33
-
34
-
35
- def main():
36
- """Assume Single Node Multi GPUs Training Only"""
37
- assert torch.cuda.is_available(), "CPU training is not allowed."
38
- hps = utils.get_hparams()
39
-
40
- n_gpus = torch.cuda.device_count()
41
- os.environ['MASTER_ADDR'] = 'localhost'
42
- os.environ['MASTER_PORT'] = hps.train.port
43
-
44
- mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
45
-
46
-
47
- def run(rank, n_gpus, hps):
48
- global global_step
49
- if rank == 0:
50
- logger = utils.get_logger(hps.model_dir)
51
- logger.info(hps)
52
- utils.check_git_hash(hps.model_dir)
53
- writer = SummaryWriter(log_dir=hps.model_dir)
54
- writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
55
-
56
- # for pytorch on win, backend use gloo
57
- dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
58
- torch.manual_seed(hps.train.seed)
59
- torch.cuda.set_device(rank)
60
- collate_fn = TextAudioCollate()
61
- all_in_mem = hps.train.all_in_mem # If you have enough memory, turn on this option to avoid disk IO and speed up training.
62
- train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps, all_in_mem=all_in_mem)
63
- num_workers = 5 if multiprocessing.cpu_count() > 4 else multiprocessing.cpu_count()
64
- if all_in_mem:
65
- num_workers = 0
66
- train_loader = DataLoader(train_dataset, num_workers=num_workers, shuffle=False, pin_memory=True,
67
- batch_size=hps.train.batch_size, collate_fn=collate_fn)
68
- if rank == 0:
69
- eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem,vol_aug = False)
70
- eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
71
- batch_size=1, pin_memory=False,
72
- drop_last=False, collate_fn=collate_fn)
73
-
74
- net_g = SynthesizerTrn(
75
- hps.data.filter_length // 2 + 1,
76
- hps.train.segment_size // hps.data.hop_length,
77
- **hps.model).cuda(rank)
78
- net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
79
- optim_g = torch.optim.AdamW(
80
- net_g.parameters(),
81
- hps.train.learning_rate,
82
- betas=hps.train.betas,
83
- eps=hps.train.eps)
84
- optim_d = torch.optim.AdamW(
85
- net_d.parameters(),
86
- hps.train.learning_rate,
87
- betas=hps.train.betas,
88
- eps=hps.train.eps)
89
- net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True)
90
- net_d = DDP(net_d, device_ids=[rank])
91
-
92
- skip_optimizer = False
93
- try:
94
- _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
95
- optim_g, skip_optimizer)
96
- _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
97
- optim_d, skip_optimizer)
98
- epoch_str = max(epoch_str, 1)
99
- name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
100
- global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1
101
- #global_step = (epoch_str - 1) * len(train_loader)
102
- except Exception:
103
- print("load old checkpoint failed...")
104
- epoch_str = 1
105
- global_step = 0
106
- if skip_optimizer:
107
- epoch_str = 1
108
- global_step = 0
109
-
110
- warmup_epoch = hps.train.warmup_epochs
111
- scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
112
- scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
113
-
114
- scaler = GradScaler(enabled=hps.train.fp16_run)
115
-
116
- for epoch in range(epoch_str, hps.train.epochs + 1):
117
- # set up warm-up learning rate
118
- if epoch <= warmup_epoch:
119
- for param_group in optim_g.param_groups:
120
- param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch
121
- for param_group in optim_d.param_groups:
122
- param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch
123
- # training
124
- if rank == 0:
125
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
126
- [train_loader, eval_loader], logger, [writer, writer_eval])
127
- else:
128
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
129
- [train_loader, None], None, None)
130
- # update learning rate
131
- scheduler_g.step()
132
- scheduler_d.step()
133
-
134
-
135
- def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
136
- net_g, net_d = nets
137
- optim_g, optim_d = optims
138
- scheduler_g, scheduler_d = schedulers
139
- train_loader, eval_loader = loaders
140
- if writers is not None:
141
- writer, writer_eval = writers
142
-
143
- half_type = torch.bfloat16 if hps.train.half_type=="bf16" else torch.float16
144
-
145
- # train_loader.batch_sampler.set_epoch(epoch)
146
- global global_step
147
-
148
- net_g.train()
149
- net_d.train()
150
- for batch_idx, items in enumerate(train_loader):
151
- c, f0, spec, y, spk, lengths, uv,volume = items
152
- g = spk.cuda(rank, non_blocking=True)
153
- spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
154
- c = c.cuda(rank, non_blocking=True)
155
- f0 = f0.cuda(rank, non_blocking=True)
156
- uv = uv.cuda(rank, non_blocking=True)
157
- lengths = lengths.cuda(rank, non_blocking=True)
158
- mel = spec_to_mel_torch(
159
- spec,
160
- hps.data.filter_length,
161
- hps.data.n_mel_channels,
162
- hps.data.sampling_rate,
163
- hps.data.mel_fmin,
164
- hps.data.mel_fmax)
165
-
166
- with autocast(enabled=hps.train.fp16_run, dtype=half_type):
167
- y_hat, ids_slice, z_mask, \
168
- (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
169
- spec_lengths=lengths,vol = volume)
170
-
171
- y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
172
- y_hat_mel = mel_spectrogram_torch(
173
- y_hat.squeeze(1),
174
- hps.data.filter_length,
175
- hps.data.n_mel_channels,
176
- hps.data.sampling_rate,
177
- hps.data.hop_length,
178
- hps.data.win_length,
179
- hps.data.mel_fmin,
180
- hps.data.mel_fmax
181
- )
182
- y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
183
-
184
- # Discriminator
185
- y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
186
-
187
- with autocast(enabled=False, dtype=half_type):
188
- loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
189
- loss_disc_all = loss_disc
190
-
191
- optim_d.zero_grad()
192
- scaler.scale(loss_disc_all).backward()
193
- scaler.unscale_(optim_d)
194
- grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
195
- scaler.step(optim_d)
196
-
197
-
198
- with autocast(enabled=hps.train.fp16_run, dtype=half_type):
199
- # Generator
200
- y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
201
- with autocast(enabled=False, dtype=half_type):
202
- loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
203
- loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
204
- loss_fm = feature_loss(fmap_r, fmap_g)
205
- loss_gen, losses_gen = generator_loss(y_d_hat_g)
206
- loss_lf0 = F.mse_loss(pred_lf0, lf0) if net_g.module.use_automatic_f0_prediction else 0
207
- loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0
208
- optim_g.zero_grad()
209
- scaler.scale(loss_gen_all).backward()
210
- scaler.unscale_(optim_g)
211
- grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
212
- scaler.step(optim_g)
213
- scaler.update()
214
-
215
- if rank == 0:
216
- if global_step % hps.train.log_interval == 0:
217
- lr = optim_g.param_groups[0]['lr']
218
- losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
219
- reference_loss=0
220
- for i in losses:
221
- reference_loss += i
222
- logger.info('Train Epoch: {} [{:.0f}%]'.format(
223
- epoch,
224
- 100. * batch_idx / len(train_loader)))
225
- logger.info(f"Losses: {[x.item() for x in losses]}, step: {global_step}, lr: {lr}, reference_loss: {reference_loss}")
226
-
227
- scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
228
- "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
229
- scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl,
230
- "loss/g/lf0": loss_lf0})
231
-
232
- # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
233
- # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
234
- # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
235
- image_dict = {
236
- "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
237
- "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
238
- "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())
239
- }
240
-
241
- if net_g.module.use_automatic_f0_prediction:
242
- image_dict.update({
243
- "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
244
- pred_lf0[0, 0, :].detach().cpu().numpy()),
245
- "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
246
- norm_lf0[0, 0, :].detach().cpu().numpy())
247
- })
248
-
249
- utils.summarize(
250
- writer=writer,
251
- global_step=global_step,
252
- images=image_dict,
253
- scalars=scalar_dict
254
- )
255
-
256
- if global_step % hps.train.eval_interval == 0:
257
- evaluate(hps, net_g, eval_loader, writer_eval)
258
- utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
259
- os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
260
- utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
261
- os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
262
- keep_ckpts = getattr(hps.train, 'keep_ckpts', 0)
263
- if keep_ckpts > 0:
264
- utils.clean_checkpoints(path_to_models=hps.model_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True)
265
-
266
- global_step += 1
267
-
268
- if rank == 0:
269
- global start_time
270
- now = time.time()
271
- durtaion = format(now - start_time, '.2f')
272
- logger.info(f'====> Epoch: {epoch}, cost {durtaion} s')
273
- start_time = now
274
-
275
-
276
- def evaluate(hps, generator, eval_loader, writer_eval):
277
- generator.eval()
278
- image_dict = {}
279
- audio_dict = {}
280
- with torch.no_grad():
281
- for batch_idx, items in enumerate(eval_loader):
282
- c, f0, spec, y, spk, _, uv,volume = items
283
- g = spk[:1].cuda(0)
284
- spec, y = spec[:1].cuda(0), y[:1].cuda(0)
285
- c = c[:1].cuda(0)
286
- f0 = f0[:1].cuda(0)
287
- uv= uv[:1].cuda(0)
288
- if volume is not None:
289
- volume = volume[:1].cuda(0)
290
- mel = spec_to_mel_torch(
291
- spec,
292
- hps.data.filter_length,
293
- hps.data.n_mel_channels,
294
- hps.data.sampling_rate,
295
- hps.data.mel_fmin,
296
- hps.data.mel_fmax)
297
- y_hat,_ = generator.module.infer(c, f0, uv, g=g,vol = volume)
298
-
299
- y_hat_mel = mel_spectrogram_torch(
300
- y_hat.squeeze(1).float(),
301
- hps.data.filter_length,
302
- hps.data.n_mel_channels,
303
- hps.data.sampling_rate,
304
- hps.data.hop_length,
305
- hps.data.win_length,
306
- hps.data.mel_fmin,
307
- hps.data.mel_fmax
308
- )
309
-
310
- audio_dict.update({
311
- f"gen/audio_{batch_idx}": y_hat[0],
312
- f"gt/audio_{batch_idx}": y[0]
313
- })
314
- image_dict.update({
315
- "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
316
- "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
317
- })
318
- utils.summarize(
319
- writer=writer_eval,
320
- global_step=global_step,
321
- images=image_dict,
322
- audios=audio_dict,
323
- audio_sampling_rate=hps.data.sampling_rate
324
- )
325
- generator.train()
326
-
327
-
328
- if __name__ == "__main__":
329
- main()
train_diff.py DELETED
@@ -1,77 +0,0 @@
-import argparse
-
-import torch
-from loguru import logger
-from torch.optim import lr_scheduler
-
-from diffusion.data_loaders import get_data_loaders
-from diffusion.logger import utils
-from diffusion.solver import train
-from diffusion.unit2mel import Unit2Mel
-from diffusion.vocoder import Vocoder
-
-
-def parse_args(args=None, namespace=None):
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-c",
-        "--config",
-        type=str,
-        required=True,
-        help="path to the config file")
-    return parser.parse_args(args=args, namespace=namespace)
-
-
-if __name__ == '__main__':
-    # parse commands
-    cmd = parse_args()
-
-    # load config
-    args = utils.load_config(cmd.config)
-    logger.info(' > config:'+ cmd.config)
-    logger.info(' > exp:'+ args.env.expdir)
-
-    # load vocoder
-    vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
-
-    # load model
-    model = Unit2Mel(
-        args.data.encoder_out_channels,
-        args.model.n_spk,
-        args.model.use_pitch_aug,
-        vocoder.dimension,
-        args.model.n_layers,
-        args.model.n_chans,
-        args.model.n_hidden,
-        args.model.timesteps,
-        args.model.k_step_max
-    )
-
-    logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
-
-    # load parameters
-    optimizer = torch.optim.AdamW(model.parameters())
-    initial_global_step, model, optimizer = utils.load_model(args.env.expdir, model, optimizer, device=args.device)
-    for param_group in optimizer.param_groups:
-        param_group['initial_lr'] = args.train.lr
-        param_group['lr'] = args.train.lr * (args.train.gamma ** max(((initial_global_step-2)//args.train.decay_step),0) )
-        param_group['weight_decay'] = args.train.weight_decay
-    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.train.decay_step, gamma=args.train.gamma,last_epoch=initial_global_step-2)
-
-    # device
-    if args.device == 'cuda':
-        torch.cuda.set_device(args.env.gpu_id)
-    model.to(args.device)
-
-    for state in optimizer.state.values():
-        for k, v in state.items():
-            if torch.is_tensor(v):
-                state[k] = v.to(args.device)
-
-    # datas
-    loader_train, loader_valid = get_data_loaders(args, whole_audio=False)
-
-    # run
-    train(args, initial_global_step, model, optimizer, scheduler, vocoder, loader_train, loader_valid)
-
train_index.py DELETED
@@ -1,30 +0,0 @@
-import argparse
-import os
-import pickle
-
-import utils
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--root_dir", type=str, default="dataset/44k", help="path to root dir"
-    )
-    parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
-                        help='JSON file for configuration')
-    parser.add_argument(
-        "--output_dir", type=str, default="logs/44k", help="path to output dir"
-    )
-
-    args = parser.parse_args()
-
-    hps = utils.get_hparams_from_file(args.config)
-    spk_dic = hps.spk
-    result = {}
-
-    for k,v in spk_dic.items():
-        print(f"now, index {k} feature...")
-        index = utils.train_index(k,args.root_dir)
-        result[v] = index
-
-    with open(os.path.join(args.output_dir,"feature_and_index.pkl"),"wb") as f:
-        pickle.dump(result,f)
trained/put_trained_checkpoints_here DELETED
File without changes
wav_upload.py DELETED
@@ -1,25 +0,0 @@
-import argparse
-import os
-import shutil
-
-from google.colab import files
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--type", type=str, required=True, help="type of file to upload")
-    args = parser.parse_args()
-    file_type = args.type
-
-    basepath = os.getcwd()
-    uploaded = files.upload()  # upload the file
-    assert(file_type in ['zip', 'audio'])
-    if file_type == "zip":
-        upload_path = "./upload/"
-        for filename in uploaded.keys():
-            # move the uploaded file to the specified location
-            shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, "userzip.zip"))
-    elif file_type == "audio":
-        upload_path = "./raw/"
-        for filename in uploaded.keys():
-            # move the uploaded file to the specified location
-            shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, filename))
 
webUI.py DELETED
@@ -1,430 +0,0 @@
- import glob
- import json
- import logging
- import os
- import re
- import subprocess
- import sys
- import time
- import traceback
- from itertools import chain
- from pathlib import Path
-
- # os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
- import gradio as gr
- import librosa
- import numpy as np
- import soundfile
- import torch
-
- from compress_model import removeOptimizer
- from edgetts.tts_voices import SUPPORTED_LANGUAGES
- from inference.infer_tool import Svc
- from utils import mix_model
-
- logging.getLogger('numba').setLevel(logging.WARNING)
- logging.getLogger('markdown_it').setLevel(logging.WARNING)
- logging.getLogger('urllib3').setLevel(logging.WARNING)
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
- logging.getLogger('multipart').setLevel(logging.WARNING)
-
- model = None
- spk = None
- debug = False
-
- local_model_root = './trained'
-
- cuda = {}
- if torch.cuda.is_available():
- for i in range(torch.cuda.device_count()):
- device_name = torch.cuda.get_device_properties(i).name
- cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"
-
- def upload_mix_append_file(files,sfiles):
- try:
- if(sfiles is None):
- file_paths = [file.name for file in files]
- else:
- file_paths = [file.name for file in chain(files,sfiles)]
- p = {file:100 for file in file_paths}
- return file_paths,mix_model_output1.update(value=json.dumps(p,indent=2))
- except Exception as e:
- if debug:
- traceback.print_exc()
- raise gr.Error(e)
-
- def mix_submit_click(js,mode):
- try:
- assert js.lstrip()!=""
- modes = {"凸组合":0, "线性组合":1}
- mode = modes[mode]
- data = json.loads(js)
- data = list(data.items())
- model_path,mix_rate = zip(*data)
- path = mix_model(model_path,mix_rate,mode)
- return f"成功,文件被保存在了{path}"
- except Exception as e:
- if debug:
- traceback.print_exc()
- raise gr.Error(e)
-
- def updata_mix_info(files):
- try:
- if files is None :
- return mix_model_output1.update(value="")
- p = {file.name:100 for file in files}
- return mix_model_output1.update(value=json.dumps(p,indent=2))
- except Exception as e:
- if debug:
- traceback.print_exc()
- raise gr.Error(e)
-
- def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix,local_model_enabled,local_model_selection):
- global model
- try:
- device = cuda[device] if "CUDA" in device else device
- cluster_filepath = os.path.split(cluster_model_path.name) if cluster_model_path is not None else "no_cluster"
- # get model and config path
- if (local_model_enabled):
- # local path
- model_path = glob.glob(os.path.join(local_model_selection, '*.pth'))[0]
- config_path = glob.glob(os.path.join(local_model_selection, '*.json'))[0]
- else:
- # upload from webpage
- model_path = model_path.name
- config_path = config_path.name
- fr = ".pkl" in cluster_filepath[1]
- model = Svc(model_path,
- config_path,
- device=device if device != "Auto" else None,
- cluster_model_path = cluster_model_path.name if cluster_model_path is not None else "",
- nsf_hifigan_enhance=enhance,
- diffusion_model_path = diff_model_path.name if diff_model_path is not None else "",
- diffusion_config_path = diff_config_path.name if diff_config_path is not None else "",
- shallow_diffusion = True if diff_model_path is not None else False,
- only_diffusion = only_diffusion,
- spk_mix_enable = use_spk_mix,
- feature_retrieval = fr
- )
- spks = list(model.spk2id.keys())
- device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
- msg = f"成功加载模型到设备{device_name}上\n"
- if cluster_model_path is None:
- msg += "未加载聚类模型或特征检索模型\n"
- elif fr:
- msg += f"特征检索模型{cluster_filepath[1]}加载成功\n"
- else:
- msg += f"聚类模型{cluster_filepath[1]}加载成功\n"
- if diff_model_path is None:
- msg += "未加载扩散模型\n"
- else:
- msg += f"扩散模型{diff_model_path.name}加载成功\n"
- msg += "当前模型的可用音色:\n"
- for i in spks:
- msg += i + " "
- return sid.update(choices = spks,value=spks[0]), msg
- except Exception as e:
- if debug:
- traceback.print_exc()
- raise gr.Error(e)
-
-
- def modelUnload():
- global model
- if model is None:
- return sid.update(choices = [],value=""),"没有模型需要卸载!"
- else:
- model.unload_model()
- model = None
- torch.cuda.empty_cache()
- return sid.update(choices = [],value=""),"模型卸载完毕!"
-
- def vc_infer(output_format, sid, audio_path, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
- global model
- _audio = model.slice_inference(
- audio_path,
- sid,
- vc_transform,
- slice_db,
- cluster_ratio,
- auto_f0,
- noise_scale,
- pad_seconds,
- cl_num,
- lg_num,
- lgr_num,
- f0_predictor,
- enhancer_adaptive_key,
- cr_threshold,
- k_step,
- use_spk_mix,
- second_encoding,
- loudness_envelope_adjustment
- )
- model.clear_empty()
- # build the output file path and save it under the results folder
- str(int(time.time()))
- if not os.path.exists("results"):
- os.makedirs("results")
- key = "auto" if auto_f0 else f"{int(vc_transform)}key"
- cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_"
- isdiffusion = "sovits"
- if model.shallow_diffusion:
- isdiffusion = "sovdiff"
-
- if model.only_diffusion:
- isdiffusion = "diff"
-
- output_file_name = 'result_'+truncated_basename+f'_{sid}_{key}{cluster}{isdiffusion}.{output_format}'
- output_file = os.path.join("results", output_file_name)
- soundfile.write(output_file, _audio, model.target_sample, format=output_format)
- return output_file
-
- def vc_fn(sid, input_audio, output_format, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment):
- global model
- try:
- if input_audio is None:
- return "You need to upload an audio", None
- if model is None:
- return "You need to upload an model", None
- if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
- if cluster_ratio != 0:
- return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
- #print(input_audio)
- audio, sampling_rate = soundfile.read(input_audio)
- #print(audio.shape,sampling_rate)
- if np.issubdtype(audio.dtype, np.integer):
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
- #print(audio.dtype)
- if len(audio.shape) > 1:
- audio = librosa.to_mono(audio.transpose(1, 0))
- # for some unknown reason the filepath uploaded by Gradio carries a strange fixed suffix; strip it here
- truncated_basename = Path(input_audio).stem[:-6]
- processed_audio = os.path.join("raw", f"{truncated_basename}.wav")
- soundfile.write(processed_audio, audio, sampling_rate, format="wav")
- output_file = vc_infer(output_format, sid, processed_audio, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
-
- return "Success", output_file
- except Exception as e:
- if debug:
- traceback.print_exc()
- raise gr.Error(e)
-
- def text_clear(text):
- return re.sub(r"[\n\,\(\) ]", "", text)
-
- def vc_fn2(_text, _lang, _gender, _rate, _volume, sid, output_format, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold, k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment):
- global model
- try:
- if model is None:
- return "You need to upload an model", None
- if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
- if cluster_ratio != 0:
- return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
- _rate = f"+{int(_rate*100)}%" if _rate >= 0 else f"{int(_rate*100)}%"
- _volume = f"+{int(_volume*100)}%" if _volume >= 0 else f"{int(_volume*100)}%"
- if _lang == "Auto":
- _gender = "Male" if _gender == "男" else "Female"
- subprocess.run([sys.executable, "edgetts/tts.py", _text, _lang, _rate, _volume, _gender])
- else:
- subprocess.run([sys.executable, "edgetts/tts.py", _text, _lang, _rate, _volume])
- target_sr = 44100
- y, sr = librosa.load("tts.wav")
- resampled_y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
- soundfile.write("tts.wav", resampled_y, target_sr, subtype = "PCM_16")
- input_audio = "tts.wav"
- #audio, _ = soundfile.read(input_audio)
- output_file_path = vc_infer(output_format, sid, input_audio, "tts", vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
- os.remove("tts.wav")
- return "Success", output_file_path
- except Exception as e:
- if debug: traceback.print_exc() # noqa: E701
- raise gr.Error(e)
-
- def model_compression(_model):
- if _model == "":
- return "请先选择要压缩的模型"
- else:
- model_path = os.path.split(_model.name)
- filename, extension = os.path.splitext(model_path[1])
- output_model_name = f"{filename}_compressed{extension}"
- output_path = os.path.join(os.getcwd(), output_model_name)
- removeOptimizer(_model.name, output_path)
- return f"模型已成功被保存在了{output_path}"
-
- def scan_local_models():
- res = []
- candidates = glob.glob(os.path.join(local_model_root, '**', '*.json'), recursive=True)
- candidates = set([os.path.dirname(c) for c in candidates])
- for candidate in candidates:
- jsons = glob.glob(os.path.join(candidate, '*.json'))
- pths = glob.glob(os.path.join(candidate, '*.pth'))
- if (len(jsons) == 1 and len(pths) == 1):
- # must contain exactly one json and one pth file
- res.append(candidate)
- return res
-
- def local_model_refresh_fn():
- choices = scan_local_models()
- return gr.Dropdown.update(choices=choices)
-
- def debug_change():
- global debug
- debug = debug_button.value
-
- with gr.Blocks(
- theme=gr.themes.Base(
- primary_hue = gr.themes.colors.green,
- font=["Source Sans Pro", "Arial", "sans-serif"],
- font_mono=['JetBrains mono', "Consolas", 'Courier New']
- ),
- ) as app:
- with gr.Tabs():
- with gr.TabItem("推理"):
- gr.Markdown(value="""
- So-vits-svc 4.0 推理 webui
- """)
- with gr.Row(variant="panel"):
- with gr.Column():
- gr.Markdown(value="""
- <font size=2> 模型设置</font>
- """)
- with gr.Tabs():
- # invisible checkbox that tracks tab status
- local_model_enabled = gr.Checkbox(value=False, visible=False)
- with gr.TabItem('上传') as local_model_tab_upload:
- with gr.Row():
- model_path = gr.File(label="选择模型文件")
- config_path = gr.File(label="选择配置文件")
- with gr.TabItem('本地') as local_model_tab_local:
- gr.Markdown(f'模型应当放置于{local_model_root}文件夹下')
- local_model_refresh_btn = gr.Button('刷新本地模型列表')
- local_model_selection = gr.Dropdown(label='选择模型文件夹', choices=[], interactive=True)
- with gr.Row():
- diff_model_path = gr.File(label="选择扩散模型文件")
- diff_config_path = gr.File(label="选择扩散模型配置文件")
- cluster_model_path = gr.File(label="选择聚类模型或特征检索文件(没有可以不选)")
- device = gr.Dropdown(label="推理设备,默认为自动选择CPU和GPU", choices=["Auto",*cuda.keys(),"cpu"], value="Auto")
- enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False)
- only_diffusion = gr.Checkbox(label="是否使用全扩散推理,开启后将不使用So-VITS模型,仅使用扩散模型进行完整扩散推理,默认关闭", value=False)
- with gr.Column():
- gr.Markdown(value="""
- <font size=3>左侧文件全部选择完毕后(全部文件模块显示download),点击“加载模型”进行解析:</font>
- """)
- model_load_button = gr.Button(value="加载模型", variant="primary")
- model_unload_button = gr.Button(value="卸载模型", variant="primary")
- sid = gr.Dropdown(label="音色(说话人)")
- sid_output = gr.Textbox(label="Output Message")
-
-
- with gr.Row(variant="panel"):
- with gr.Column():
- gr.Markdown(value="""
- <font size=2> 推理设置</font>
- """)
- auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声勾选此项会究极跑调)", value=False)
- f0_predictor = gr.Dropdown(label="选择F0预测器,可选择crepe,pm,dio,harvest,rmvpe,默认为pm(注意:crepe为原F0使用均值滤波器)", choices=["pm","dio","harvest","crepe","rmvpe"], value="pm")
- vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
- cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,0即不启用聚类/特征检索。使用聚类/特征检索能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
- slice_db = gr.Number(label="切片阈值", value=-40)
- output_format = gr.Radio(label="音频输出格式", choices=["wav", "flac", "mp3"], value = "wav")
- noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
- k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000)
- with gr.Column():
- pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
- cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒(s)", value=0)
- lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
- lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
- enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
- cr_threshold = gr.Number(label="F0过滤阈值,只有启动crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
- loudness_envelope_adjustment = gr.Number(label="输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络", value = 0)
- second_encoding = gr.Checkbox(label = "二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,效果时好时差,默认关闭", value=False)
- use_spk_mix = gr.Checkbox(label = "动态声线融合", value = False, interactive = False)
- with gr.Tabs():
- with gr.TabItem("音频转音频"):
- vc_input3 = gr.Audio(label="选择音频", type="filepath")
- vc_submit = gr.Button("音频转换", variant="primary")
- with gr.TabItem("文字转音频"):
- text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪")
- with gr.Row():
- tts_gender = gr.Radio(label = "说话人性别", choices = ["男","女"], value = "男")
- tts_lang = gr.Dropdown(label = "选择语言,Auto为根据输入文字自动识别", choices=SUPPORTED_LANGUAGES, value = "Auto")
- tts_rate = gr.Slider(label = "TTS语音变速(倍速相对值)", minimum = -1, maximum = 3, value = 0, step = 0.1)
- tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = -1, maximum = 1.5, value = 0, step = 0.1)
- vc_submit2 = gr.Button("文字转换", variant="primary")
- with gr.Row():
- with gr.Column():
- vc_output1 = gr.Textbox(label="Output Message")
- with gr.Column():
- vc_output2 = gr.Audio(label="Output Audio", interactive=False)
-
- with gr.TabItem("小工具/实验室特性"):
- gr.Markdown(value="""
- <font size=2> So-vits-svc 4.0 小工具/实验室特性</font>
- """)
- with gr.Tabs():
- with gr.TabItem("静态声线融合"):
- gr.Markdown(value="""
- <font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线
- 注意:
- 1.该功能仅支持单说话人的模型
- 2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音
- 3.保证所有待混合模型的config.json中的model字段是相同的
- 4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用
- 5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传
- 6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果
- 7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth
- 8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会
- </font>
- """)
- mix_model_path = gr.Files(label="选择需要混合模型文件")
- mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple")
- mix_model_output1 = gr.Textbox(
- label="混合比例调整,单位/%",
- interactive = True
- )
- mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式",value="凸组合",interactive = True)
- mix_submit = gr.Button("声线融合启动", variant="primary")
- mix_model_output2 = gr.Textbox(
- label="Output Message"
- )
- mix_model_path.change(updata_mix_info,[mix_model_path],[mix_model_output1])
- mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
- mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
-
- with gr.TabItem("模型压缩工具"):
- gr.Markdown(value="""
- 该工具可以实现对模型的体积压缩,在**不影响模型推理功能**的情况下,将原本约600M的So-VITS模型压缩至约200M, 大大减少了硬盘的压力。
- **注意:压缩后的模型将无法继续训练,请在确认封炉后再压缩。**
- """)
- model_to_compress = gr.File(label="模型上传")
- compress_model_btn = gr.Button("压缩模型", variant="primary")
- compress_model_output = gr.Textbox(label="输出信息", value="")
-
- compress_model_btn.click(model_compression, [model_to_compress], [compress_model_output])
-
-
- with gr.Tabs():
- with gr.Row(variant="panel"):
- with gr.Column():
- gr.Markdown(value="""
- <font size=2> WebUI设置</font>
- """)
- debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
- # refresh local model list
- local_model_refresh_btn.click(local_model_refresh_fn, outputs=local_model_selection)
- # set local enabled/disabled on tab switch
- local_model_tab_upload.select(lambda: False, outputs=local_model_enabled)
- local_model_tab_local.select(lambda: True, outputs=local_model_enabled)
-
- vc_submit.click(vc_fn, [sid, vc_input3, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
- vc_submit2.click(vc_fn2, [text2tts, tts_lang, tts_gender, tts_rate, tts_volume, sid, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
-
- debug_button.change(debug_change,[],[])
- model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix,local_model_enabled,local_model_selection],[sid,sid_output])
- model_unload_button.click(modelUnload,[],[sid,sid_output])
- os.system("start http://127.0.0.1:7860")
- app.launch()
-