subatomicseer committed on
Commit
b754b20
1 Parent(s): 283df5c

Upload 6 files

Browse files
ckpt/config.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 500,
4
+ "eval_interval": 10000,
5
+ "save_interval": 10000,
6
+ "seed": 1234,
7
+ "epochs": 1000,
8
+ "optimizer": "adamw",
9
+ "lr_decay_on": true,
10
+ "learning_rate": 5e-5,
11
+ "betas": [0.8, 0.99],
12
+ "eps": 1e-9,
13
+ "batch_size": 32,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 35840,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 1,
20
+ "c_diff": 1,
21
+ "aug": true,
22
+ "lambda_commit": 0.02
23
+ },
24
+ "data": {
25
+ "train_filelist_path": "libritts/train_wav_final.txt",
26
+ "test_filelist_path": "libritts/test_wav_final.txt",
27
+ "text_cleaners":["english_cleaners2"],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1280,
31
+ "hop_length": 320,
32
+ "win_length": 1280,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0,
35
+ "mel_fmax": 8000,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true,
39
+ "aug_rate": 1.0,
40
+ "top_db": 20
41
+ },
42
+ "model": {
43
+ "inter_channels": 192,
44
+ "hidden_channels": 192,
45
+ "filter_channels": 768,
46
+ "n_heads": 2,
47
+ "n_layers": 6,
48
+ "kernel_size": 3,
49
+ "p_dropout": 0.1,
50
+ "resblock": "1",
51
+ "resblock_kernel_sizes": [3,7,11],
52
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
53
+ "upsample_rates": [5,4,4,2,2],
54
+ "upsample_initial_channel": 512,
55
+ "upsample_kernel_sizes": [11,8,8,4,4],
56
+ "mixup_ratio": 0.6,
57
+ "n_layers_q": 3,
58
+ "use_spectral_norm": false,
59
+ "encoder_hidden_size": 128
60
+ },
61
+ "f0_vq_params": {
62
+ "l_bins": 20,
63
+ "emb_width": 128,
64
+ "mu": 0.99,
65
+ "levels": 1
66
+ },
67
+ "f0_encoder_params": {
68
+ "input_emb_width": 1,
69
+ "output_emb_width": 128,
70
+ "levels": 1,
71
+ "downs_t": [4],
72
+ "strides_t": [2],
73
+ "width": 32,
74
+ "depth": 4,
75
+ "m_conv": 1.0,
76
+ "dilation_growth_rate": 3
77
+ },
78
+ "f0_decoder_params": {
79
+ "input_emb_width": 1,
80
+ "output_emb_width": 128,
81
+ "levels": 1,
82
+ "downs_t": [4],
83
+ "strides_t": [2],
84
+ "width": 32,
85
+ "depth": 4,
86
+ "m_conv": 1.0,
87
+ "dilation_growth_rate": 3
88
+ },
89
+ "diffusion" : {
90
+ "dec_dim" : 128,
91
+ "spk_dim" : 128,
92
+ "use_ref_t" : false,
93
+ "beta_min" : 0.05,
94
+ "beta_max" : 20.0
95
+ }
96
+ }
ckpt/model_base.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee57ff7ff8b8c87baa44ca17f08d447785379d59ad7453aa930ef68b17562ac
3
+ size 794073776
f0_vqvae/config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 10000,
5
+ "save_interval": 10000,
6
+ "seed": 1234,
7
+ "epochs": 20000,
8
+ "learning_rate": 2e-4,
9
+ "betas": [0.8, 0.99],
10
+ "eps": 1e-9,
11
+ "batch_size": 16,
12
+ "fp16_run": false,
13
+ "lr_decay": 0.999875,
14
+ "segment_size": 16640,
15
+ "init_lr_ratio": 1,
16
+ "warmup_epochs": 0,
17
+ "c_mel": 45,
18
+ "c_kl": 1,
19
+ "c_kl_pitch": 1,
20
+ "c_pho": 45.0,
21
+ "fs": true,
22
+ "lambda_commit": 0.02
23
+ },
24
+ "data": {
25
+ "train_filelist_path": "filelists/train_f0_final.txt",
26
+ "test_filelist_path": "filelists/test_f0_final.txt",
27
+ "text_cleaners":["english_cleaners2"],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1280,
31
+ "hop_length": 320,
32
+ "win_length": 1280,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0,
35
+ "mel_fmax": 8000,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true,
39
+ "aug_rate": 1.0,
40
+ "top_db": 20
41
+ },
42
+ "model": {
43
+ "inter_channels": 192,
44
+ "hidden_channels": 192,
45
+ "filter_channels": 768,
46
+ "n_heads": 2,
47
+ "n_layers": 6,
48
+ "kernel_size": 3,
49
+ "p_dropout": 0.1,
50
+ "resblock": "1",
51
+ "resblock_kernel_sizes": [3,7,11],
52
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
53
+ "upsample_rates": [5,4,4,2,2],
54
+ "upsample_initial_channel": 512,
55
+ "upsample_kernel_sizes": [11,8,8,4,4],
56
+ "mixup_ratio": 0.6,
57
+ "n_layers_q": 3,
58
+ "use_spectral_norm": false
59
+ },
60
+ "f0_vq_params": {
61
+ "l_bins": 20,
62
+ "emb_width": 128,
63
+ "mu": 0.99,
64
+ "levels": 1
65
+ },
66
+ "f0_encoder_params": {
67
+ "input_emb_width": 1,
68
+ "output_emb_width": 128,
69
+ "levels": 1,
70
+ "downs_t": [4],
71
+ "strides_t": [2],
72
+ "width": 32,
73
+ "depth": 4,
74
+ "m_conv": 1.0,
75
+ "dilation_growth_rate": 3
76
+ },
77
+ "f0_decoder_params": {
78
+ "input_emb_width": 1,
79
+ "output_emb_width": 128,
80
+ "levels": 1,
81
+ "downs_t": [4],
82
+ "strides_t": [2],
83
+ "width": 32,
84
+ "depth": 4,
85
+ "m_conv": 1.0,
86
+ "dilation_growth_rate": 3
87
+ }
88
+ }
f0_vqvae/f0_vqvae.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f705566a58cc0358555359510b90654301950d203eab8cf8f7335254a12cbc79
3
+ size 2562299
vocoder/voc_bigvgan.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7169372d42c08aec46cdc639cdaa29cd2faa722138c477f843d4d7f1ebc22cb4
3
+ size 620828487
vocoder/voc_ckpt.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8e7b8a55739edf6845ef37f4af4efa4af4f6bbf930842c28115a283ed3ee06
3
+ size 157373725