Spaces:
Runtime error
Runtime error
Delete configs_template
Browse files
configs_template/config_template.json
DELETED
@@ -1,79 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"train": {
|
3 |
-
"log_interval": 200,
|
4 |
-
"eval_interval": 800,
|
5 |
-
"seed": 1234,
|
6 |
-
"epochs": 10000,
|
7 |
-
"learning_rate": 0.0001,
|
8 |
-
"betas": [
|
9 |
-
0.8,
|
10 |
-
0.99
|
11 |
-
],
|
12 |
-
"eps": 1e-09,
|
13 |
-
"batch_size": 6,
|
14 |
-
"fp16_run": false,
|
15 |
-
"half_type": "fp16",
|
16 |
-
"lr_decay": 0.999875,
|
17 |
-
"segment_size": 10240,
|
18 |
-
"init_lr_ratio": 1,
|
19 |
-
"warmup_epochs": 0,
|
20 |
-
"c_mel": 45,
|
21 |
-
"c_kl": 1.0,
|
22 |
-
"use_sr": true,
|
23 |
-
"max_speclen": 512,
|
24 |
-
"port": "8001",
|
25 |
-
"keep_ckpts": 3,
|
26 |
-
"all_in_mem": false,
|
27 |
-
"vol_aug":false
|
28 |
-
},
|
29 |
-
"data": {
|
30 |
-
"training_files": "filelists/train.txt",
|
31 |
-
"validation_files": "filelists/val.txt",
|
32 |
-
"max_wav_value": 32768.0,
|
33 |
-
"sampling_rate": 44100,
|
34 |
-
"filter_length": 2048,
|
35 |
-
"hop_length": 512,
|
36 |
-
"win_length": 2048,
|
37 |
-
"n_mel_channels": 80,
|
38 |
-
"mel_fmin": 0.0,
|
39 |
-
"mel_fmax": 22050,
|
40 |
-
"unit_interpolate_mode":"nearest"
|
41 |
-
},
|
42 |
-
"model": {
|
43 |
-
"inter_channels": 192,
|
44 |
-
"hidden_channels": 192,
|
45 |
-
"filter_channels": 768,
|
46 |
-
"n_heads": 2,
|
47 |
-
"n_layers": 6,
|
48 |
-
"kernel_size": 3,
|
49 |
-
"p_dropout": 0.1,
|
50 |
-
"resblock": "1",
|
51 |
-
"resblock_kernel_sizes": [3,7,11],
|
52 |
-
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
53 |
-
"upsample_rates": [ 8, 8, 2, 2, 2],
|
54 |
-
"upsample_initial_channel": 512,
|
55 |
-
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
56 |
-
"n_layers_q": 3,
|
57 |
-
"n_layers_trans_flow": 3,
|
58 |
-
"n_flow_layer": 4,
|
59 |
-
"use_spectral_norm": false,
|
60 |
-
"gin_channels": 768,
|
61 |
-
"ssl_dim": 768,
|
62 |
-
"n_speakers": 200,
|
63 |
-
"vocoder_name":"nsf-hifigan",
|
64 |
-
"speech_encoder":"vec768l12",
|
65 |
-
"speaker_embedding":false,
|
66 |
-
"vol_embedding":false,
|
67 |
-
"use_depthwise_conv":false,
|
68 |
-
"flow_share_parameter": false,
|
69 |
-
"use_automatic_f0_prediction": true,
|
70 |
-
"use_transformer_flow": false
|
71 |
-
},
|
72 |
-
"spk": {
|
73 |
-
"nyaru": 0,
|
74 |
-
"huiyu": 1,
|
75 |
-
"nen": 2,
|
76 |
-
"paimon": 3,
|
77 |
-
"yunhao": 4
|
78 |
-
}
|
79 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs_template/config_tiny_template.json
DELETED
@@ -1,79 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"train": {
|
3 |
-
"log_interval": 200,
|
4 |
-
"eval_interval": 800,
|
5 |
-
"seed": 1234,
|
6 |
-
"epochs": 10000,
|
7 |
-
"learning_rate": 0.0001,
|
8 |
-
"betas": [
|
9 |
-
0.8,
|
10 |
-
0.99
|
11 |
-
],
|
12 |
-
"eps": 1e-09,
|
13 |
-
"batch_size": 6,
|
14 |
-
"fp16_run": false,
|
15 |
-
"half_type": "fp16",
|
16 |
-
"lr_decay": 0.999875,
|
17 |
-
"segment_size": 10240,
|
18 |
-
"init_lr_ratio": 1,
|
19 |
-
"warmup_epochs": 0,
|
20 |
-
"c_mel": 45,
|
21 |
-
"c_kl": 1.0,
|
22 |
-
"use_sr": true,
|
23 |
-
"max_speclen": 512,
|
24 |
-
"port": "8001",
|
25 |
-
"keep_ckpts": 3,
|
26 |
-
"all_in_mem": false,
|
27 |
-
"vol_aug":false
|
28 |
-
},
|
29 |
-
"data": {
|
30 |
-
"training_files": "filelists/train.txt",
|
31 |
-
"validation_files": "filelists/val.txt",
|
32 |
-
"max_wav_value": 32768.0,
|
33 |
-
"sampling_rate": 44100,
|
34 |
-
"filter_length": 2048,
|
35 |
-
"hop_length": 512,
|
36 |
-
"win_length": 2048,
|
37 |
-
"n_mel_channels": 80,
|
38 |
-
"mel_fmin": 0.0,
|
39 |
-
"mel_fmax": 22050,
|
40 |
-
"unit_interpolate_mode":"nearest"
|
41 |
-
},
|
42 |
-
"model": {
|
43 |
-
"inter_channels": 192,
|
44 |
-
"hidden_channels": 192,
|
45 |
-
"filter_channels": 512,
|
46 |
-
"n_heads": 2,
|
47 |
-
"n_layers": 6,
|
48 |
-
"kernel_size": 3,
|
49 |
-
"p_dropout": 0.1,
|
50 |
-
"resblock": "1",
|
51 |
-
"resblock_kernel_sizes": [3,7,11],
|
52 |
-
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
53 |
-
"upsample_rates": [ 8, 8, 2, 2, 2],
|
54 |
-
"upsample_initial_channel": 400,
|
55 |
-
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
56 |
-
"n_layers_q": 3,
|
57 |
-
"n_layers_trans_flow": 3,
|
58 |
-
"n_flow_layer": 4,
|
59 |
-
"use_spectral_norm": false,
|
60 |
-
"gin_channels": 768,
|
61 |
-
"ssl_dim": 768,
|
62 |
-
"n_speakers": 200,
|
63 |
-
"vocoder_name":"nsf-hifigan",
|
64 |
-
"speech_encoder":"vec768l12",
|
65 |
-
"speaker_embedding":false,
|
66 |
-
"vol_embedding":false,
|
67 |
-
"use_depthwise_conv":true,
|
68 |
-
"flow_share_parameter": true,
|
69 |
-
"use_automatic_f0_prediction": true,
|
70 |
-
"use_transformer_flow": false
|
71 |
-
},
|
72 |
-
"spk": {
|
73 |
-
"nyaru": 0,
|
74 |
-
"huiyu": 1,
|
75 |
-
"nen": 2,
|
76 |
-
"paimon": 3,
|
77 |
-
"yunhao": 4
|
78 |
-
}
|
79 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs_template/diffusion_template.yaml
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
data:
|
2 |
-
sampling_rate: 44100
|
3 |
-
block_size: 512 # Equal to hop_length
|
4 |
-
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
5 |
-
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
6 |
-
cnhubertsoft_gate: 10
|
7 |
-
encoder_sample_rate: 16000
|
8 |
-
encoder_hop_size: 320
|
9 |
-
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
10 |
-
training_files: "filelists/train.txt"
|
11 |
-
validation_files: "filelists/val.txt"
|
12 |
-
extensions: # List of extension included in the data collection
|
13 |
-
- wav
|
14 |
-
unit_interpolate_mode: "nearest"
|
15 |
-
model:
|
16 |
-
type: 'Diffusion'
|
17 |
-
n_layers: 20
|
18 |
-
n_chans: 512
|
19 |
-
n_hidden: 256
|
20 |
-
use_pitch_aug: true
|
21 |
-
timesteps : 1000
|
22 |
-
k_step_max: 0 # must <= timesteps, If it is 0, train all
|
23 |
-
n_spk: 1 # max number of different speakers
|
24 |
-
device: cuda
|
25 |
-
vocoder:
|
26 |
-
type: 'nsf-hifigan'
|
27 |
-
ckpt: 'pretrain/nsf_hifigan/model'
|
28 |
-
infer:
|
29 |
-
speedup: 10
|
30 |
-
method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
|
31 |
-
env:
|
32 |
-
expdir: logs/44k/diffusion
|
33 |
-
gpu_id: 0
|
34 |
-
train:
|
35 |
-
num_workers: 4 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
36 |
-
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
37 |
-
batch_size: 48
|
38 |
-
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
39 |
-
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
40 |
-
cache_fp16: true
|
41 |
-
epochs: 100000
|
42 |
-
interval_log: 10
|
43 |
-
interval_val: 2000
|
44 |
-
interval_force_save: 5000
|
45 |
-
lr: 0.0001
|
46 |
-
decay_step: 100000
|
47 |
-
gamma: 0.5
|
48 |
-
weight_decay: 0
|
49 |
-
save_opt: false
|
50 |
-
spk:
|
51 |
-
'nyaru': 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|