drewThomasson commited on
Commit
9b9c373
1 Parent(s): 67752b0

Upload 24 files

Browse files
Files changed (24) hide show
  1. local_coqui_tts_models/.DS_Store +0 -0
  2. local_coqui_tts_models/tts_models--ca--custom--vits/.DS_Store +0 -0
  3. local_coqui_tts_models/tts_models--ca--custom--vits/._config.json +0 -0
  4. local_coqui_tts_models/tts_models--ca--custom--vits/._model_file.pth +3 -0
  5. local_coqui_tts_models/tts_models--ca--custom--vits/._speaker_ids.pth +3 -0
  6. local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/.DS_Store +0 -0
  7. local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._config.json +0 -0
  8. local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._model_file.pth +3 -0
  9. local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._speaker_ids.pth +3 -0
  10. local_coqui_tts_models/tts_models--ca--custom--vits/config.json +269 -0
  11. local_coqui_tts_models/tts_models--ca--custom--vits/model_file.pth +3 -0
  12. local_coqui_tts_models/tts_models--ca--custom--vits/speaker_ids.pth +3 -0
  13. local_coqui_tts_models/tts_models--en--vctk--fast_pitch/config.json +218 -0
  14. local_coqui_tts_models/tts_models--en--vctk--fast_pitch/model_file.pth +3 -0
  15. local_coqui_tts_models/tts_models--en--vctk--fast_pitch/speaker_ids.json +110 -0
  16. local_coqui_tts_models/tts_models--en--vctk--vits/config.json +504 -0
  17. local_coqui_tts_models/tts_models--en--vctk--vits/model_file.pth +3 -0
  18. local_coqui_tts_models/tts_models--en--vctk--vits/speaker_ids.json +111 -0
  19. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/.DS_Store +0 -0
  20. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/config.json +159 -0
  21. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/hash.md5 +1 -0
  22. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/model.pth +3 -0
  23. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/tos_agreed.txt +0 -0
  24. local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/vocab.json +0 -0
local_coqui_tts_models/.DS_Store ADDED
Binary file (8.2 kB). View file
 
local_coqui_tts_models/tts_models--ca--custom--vits/.DS_Store ADDED
Binary file (6.15 kB). View file
 
local_coqui_tts_models/tts_models--ca--custom--vits/._config.json ADDED
Binary file (176 Bytes). View file
 
local_coqui_tts_models/tts_models--ca--custom--vits/._model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bbc7856dd427df6aeb4b7900dc629272704cdd8dea679a151ea15122fccc77b
3
+ size 1548
local_coqui_tts_models/tts_models--ca--custom--vits/._speaker_ids.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6642567185879c93920f9e8ded216aac964f512dcdeb52b534dfbe1547cd01e2
3
+ size 1542
local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/.DS_Store ADDED
Binary file (6.15 kB). View file
 
local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._config.json ADDED
Binary file (176 Bytes). View file
 
local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bbc7856dd427df6aeb4b7900dc629272704cdd8dea679a151ea15122fccc77b
3
+ size 1548
local_coqui_tts_models/tts_models--ca--custom--vits/__MACOSX/tts_models--ca--custom--vits/._speaker_ids.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6642567185879c93920f9e8ded216aac964f512dcdeb52b534dfbe1547cd01e2
3
+ size 1542
local_coqui_tts_models/tts_models--ca--custom--vits/config.json ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
3
+ "logger_uri": null,
4
+ "run_name": "multispeaker_vits_ca_1e4_1e4_32",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 1000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": true,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": true,
21
+ "test_delay_epochs": -1,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 16,
30
+ "eval_batch_size": 8,
31
+ "grad_clip": [
32
+ 1000.0,
33
+ 1000.0
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": "",
47
+ "lr_scheduler_params": null,
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 4,
56
+ "num_eval_loader_workers": 4,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 22050,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": true,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "ca",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
75
+ "characters": {
76
+ "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
77
+ "vocab_dict": null,
78
+ "pad": "<PAD>",
79
+ "eos": "<EOS>",
80
+ "bos": "<BOS>",
81
+ "blank": "<BLNK>",
82
+ "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
83
+ "punctuations": "!'(),-.:;? ",
84
+ "phonemes": null,
85
+ "is_unique": false,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 5,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": 325,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 0,
99
+ "start_by_longest": false,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "vctk_old",
105
+ "dataset_name": "vctk_old",
106
+ "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
107
+ "meta_file_train": "",
108
+ "ignored_speakers": [
109
+ "uri",
110
+ "09796",
111
+ "05450"
112
+ ],
113
+ "language": "ca",
114
+ "phonemizer": "",
115
+ "meta_file_val": "",
116
+ "meta_file_attn_mask": ""
117
+ }
118
+ ],
119
+ "test_sentences": [
120
+ [
121
+ "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
122
+ ],
123
+ [
124
+ "Preguntin-se si aix\u00f2 era necessari."
125
+ ],
126
+ [
127
+ "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
128
+ ],
129
+ [
130
+ "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
131
+ ]
132
+ ],
133
+ "eval_split_max_size": null,
134
+ "eval_split_size": 0.01,
135
+ "use_speaker_weighted_sampler": false,
136
+ "speaker_weighted_sampler_alpha": 1.0,
137
+ "use_language_weighted_sampler": false,
138
+ "language_weighted_sampler_alpha": 1.0,
139
+ "use_length_weighted_sampler": false,
140
+ "length_weighted_sampler_alpha": 1.0,
141
+ "model_args": {
142
+ "num_chars": 131,
143
+ "out_channels": 513,
144
+ "spec_segment_size": 32,
145
+ "hidden_channels": 192,
146
+ "hidden_channels_ffn_text_encoder": 768,
147
+ "num_heads_text_encoder": 2,
148
+ "num_layers_text_encoder": 6,
149
+ "kernel_size_text_encoder": 3,
150
+ "dropout_p_text_encoder": 0.1,
151
+ "dropout_p_duration_predictor": 0.5,
152
+ "kernel_size_posterior_encoder": 5,
153
+ "dilation_rate_posterior_encoder": 1,
154
+ "num_layers_posterior_encoder": 16,
155
+ "kernel_size_flow": 5,
156
+ "dilation_rate_flow": 1,
157
+ "num_layers_flow": 4,
158
+ "resblock_type_decoder": "1",
159
+ "resblock_kernel_sizes_decoder": [
160
+ 3,
161
+ 7,
162
+ 11
163
+ ],
164
+ "resblock_dilation_sizes_decoder": [
165
+ [
166
+ 1,
167
+ 3,
168
+ 5
169
+ ],
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ]
180
+ ],
181
+ "upsample_rates_decoder": [
182
+ 8,
183
+ 8,
184
+ 2,
185
+ 2
186
+ ],
187
+ "upsample_initial_channel_decoder": 512,
188
+ "upsample_kernel_sizes_decoder": [
189
+ 16,
190
+ 16,
191
+ 4,
192
+ 4
193
+ ],
194
+ "periods_multi_period_discriminator": [
195
+ 2,
196
+ 3,
197
+ 5,
198
+ 7,
199
+ 11
200
+ ],
201
+ "use_sdp": true,
202
+ "noise_scale": 1.0,
203
+ "inference_noise_scale": 0.667,
204
+ "length_scale": 1.0,
205
+ "noise_scale_dp": 1.0,
206
+ "inference_noise_scale_dp": 1.0,
207
+ "max_inference_len": null,
208
+ "init_discriminator": true,
209
+ "use_spectral_norm_disriminator": false,
210
+ "use_speaker_embedding": true,
211
+ "num_speakers": 257,
212
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--ca--custom--vits/speaker_ids.pth",
213
+ "d_vector_file": null,
214
+ "speaker_embedding_channels": 256,
215
+ "use_d_vector_file": false,
216
+ "d_vector_dim": 0,
217
+ "detach_dp_input": true,
218
+ "use_language_embedding": false,
219
+ "embedded_language_dim": 4,
220
+ "num_languages": 0,
221
+ "language_ids_file": null,
222
+ "use_speaker_encoder_as_loss": false,
223
+ "speaker_encoder_config_path": "",
224
+ "speaker_encoder_model_path": "",
225
+ "condition_dp_on_speaker": true,
226
+ "freeze_encoder": false,
227
+ "freeze_DP": false,
228
+ "freeze_PE": false,
229
+ "freeze_flow_decoder": false,
230
+ "freeze_waveform_decoder": false,
231
+ "encoder_sample_rate": null,
232
+ "interpolate_z": true,
233
+ "reinit_DP": false,
234
+ "reinit_text_encoder": false
235
+ },
236
+ "lr_gen": 0.0001,
237
+ "lr_disc": 0.0001,
238
+ "lr_scheduler_gen": "ExponentialLR",
239
+ "lr_scheduler_gen_params": {
240
+ "gamma": 0.999875,
241
+ "last_epoch": -1
242
+ },
243
+ "lr_scheduler_disc": "ExponentialLR",
244
+ "lr_scheduler_disc_params": {
245
+ "gamma": 0.999875,
246
+ "last_epoch": -1
247
+ },
248
+ "kl_loss_alpha": 1.0,
249
+ "disc_loss_alpha": 1.0,
250
+ "gen_loss_alpha": 1.0,
251
+ "feat_loss_alpha": 1.0,
252
+ "mel_loss_alpha": 45.0,
253
+ "dur_loss_alpha": 1.0,
254
+ "speaker_encoder_loss_alpha": 1.0,
255
+ "return_wav": true,
256
+ "use_weighted_sampler": false,
257
+ "weighted_sampler_attrs": null,
258
+ "weighted_sampler_multipliers": null,
259
+ "r": 1,
260
+ "num_speakers": 257,
261
+ "use_speaker_embedding": true,
262
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--ca--custom--vits/speaker_ids.pth",
263
+ "speaker_embedding_channels": 256,
264
+ "language_ids_file": null,
265
+ "use_language_embedding": false,
266
+ "use_d_vector_file": false,
267
+ "d_vector_file": null,
268
+ "d_vector_dim": 0
269
+ }
local_coqui_tts_models/tts_models--ca--custom--vits/model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
3
+ size 1038659133
local_coqui_tts_models/tts_models--ca--custom--vits/speaker_ids.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
3
+ size 30191
local_coqui_tts_models/tts_models--en--vctk--fast_pitch/config.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/home/ubuntu/TTS/recipes/vctk/fast_pitch",
3
+ "logger_uri": null,
4
+ "run_name": "fast_pitch_ljspeech",
5
+ "project_name": null,
6
+ "run_description": "",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 10000,
14
+ "save_step": 10000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": -1,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 32,
30
+ "eval_batch_size": 16,
31
+ "grad_clip": 5.0,
32
+ "scheduler_after_epoch": false,
33
+ "lr": 0.0001,
34
+ "optimizer": "Adam",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.9,
38
+ 0.998
39
+ ],
40
+ "weight_decay": 1e-06
41
+ },
42
+ "lr_scheduler": "NoamLR",
43
+ "lr_scheduler_params": {
44
+ "warmup_steps": 4000
45
+ },
46
+ "use_grad_scaler": false,
47
+ "allow_tf32": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": true,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 8,
54
+ "num_eval_loader_workers": 4,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 23,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000.0,
78
+ "spec_gain": 1,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": true,
91
+ "phonemizer": "gruut",
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": true,
94
+ "text_cleaner": "english_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "/home/ubuntu/TTS/recipes/vctk/fast_pitch/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": null,
100
+ "vocab_dict": null,
101
+ "pad": "_",
102
+ "eos": "~",
103
+ "bos": "^",
104
+ "blank": null,
105
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
106
+ "punctuations": "!'(),-.:;? ",
107
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "min_audio_len": 1,
115
+ "max_audio_len": Infinity,
116
+ "min_text_len": 1,
117
+ "max_text_len": Infinity,
118
+ "compute_f0": true,
119
+ "compute_energy": false,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "shuffle": false,
124
+ "drop_last": false,
125
+ "datasets": [
126
+ {
127
+ "formatter": "",
128
+ "dataset_name": "",
129
+ "path": "/home/ubuntu/TTS/VCTK-Corpus-removed-silence",
130
+ "meta_file_train": "",
131
+ "ignored_speakers": null,
132
+ "language": "",
133
+ "phonemizer": "",
134
+ "meta_file_val": "",
135
+ "meta_file_attn_mask": ""
136
+ }
137
+ ],
138
+ "test_sentences": [
139
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
140
+ "Be a voice, not an echo.",
141
+ "I'm sorry Dave. I'm afraid I can't do that.",
142
+ "This cake is great. It's so delicious and moist.",
143
+ "Prior to November 22, 1963."
144
+ ],
145
+ "eval_split_max_size": null,
146
+ "eval_split_size": 0.01,
147
+ "use_speaker_weighted_sampler": false,
148
+ "speaker_weighted_sampler_alpha": 1.0,
149
+ "use_language_weighted_sampler": false,
150
+ "language_weighted_sampler_alpha": 1.0,
151
+ "use_length_weighted_sampler": false,
152
+ "length_weighted_sampler_alpha": 1.0,
153
+ "base_model": "forward_tts",
154
+ "model_args": {
155
+ "num_chars": 130,
156
+ "out_channels": 80,
157
+ "hidden_channels": 384,
158
+ "use_aligner": true,
159
+ "use_pitch": true,
160
+ "pitch_predictor_hidden_channels": 256,
161
+ "pitch_predictor_kernel_size": 3,
162
+ "pitch_predictor_dropout_p": 0.1,
163
+ "pitch_embedding_kernel_size": 3,
164
+ "use_energy": false,
165
+ "energy_predictor_hidden_channels": 256,
166
+ "energy_predictor_kernel_size": 3,
167
+ "energy_predictor_dropout_p": 0.1,
168
+ "energy_embedding_kernel_size": 3,
169
+ "duration_predictor_hidden_channels": 256,
170
+ "duration_predictor_kernel_size": 3,
171
+ "duration_predictor_dropout_p": 0.1,
172
+ "positional_encoding": true,
173
+ "poisitonal_encoding_use_scale": true,
174
+ "length_scale": 1,
175
+ "encoder_type": "fftransformer",
176
+ "encoder_params": {
177
+ "hidden_channels_ffn": 1024,
178
+ "num_heads": 1,
179
+ "num_layers": 6,
180
+ "dropout_p": 0.1
181
+ },
182
+ "decoder_type": "fftransformer",
183
+ "decoder_params": {
184
+ "hidden_channels_ffn": 1024,
185
+ "num_heads": 1,
186
+ "num_layers": 6,
187
+ "dropout_p": 0.1
188
+ },
189
+ "detach_duration_predictor": false,
190
+ "max_duration": 75,
191
+ "num_speakers": 108,
192
+ "use_speaker_embedding": true,
193
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--en--vctk--fast_pitch/speaker_ids.json",
194
+ "use_d_vector_file": false,
195
+ "d_vector_dim": 0,
196
+ "d_vector_file": null
197
+ },
198
+ "num_speakers": 0,
199
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--en--vctk--fast_pitch/speaker_ids.json",
200
+ "use_speaker_embedding": true,
201
+ "use_d_vector_file": false,
202
+ "d_vector_file": false,
203
+ "d_vector_dim": 0,
204
+ "spec_loss_type": "mse",
205
+ "duration_loss_type": "mse",
206
+ "use_ssim_loss": true,
207
+ "ssim_loss_alpha": 1.0,
208
+ "spec_loss_alpha": 1.0,
209
+ "aligner_loss_alpha": 1.0,
210
+ "pitch_loss_alpha": 1.0,
211
+ "dur_loss_alpha": 1.0,
212
+ "binary_align_loss_alpha": 1.0,
213
+ "binary_loss_warmup_epochs": 150,
214
+ "min_seq_len": 13,
215
+ "max_seq_len": 500000,
216
+ "r": 1,
217
+ "f0_cache_path": "/home/ubuntu/TTS/recipes/vctk/fast_pitch/f0_cache"
218
+ }
local_coqui_tts_models/tts_models--en--vctk--fast_pitch/model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5bd8f0b2bb7222be9859f308e0ec22223f875194ec17f70358a598d4c9a20f6
3
+ size 458564167
local_coqui_tts_models/tts_models--en--vctk--fast_pitch/speaker_ids.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "VCTK_p225": 0,
3
+ "VCTK_p226": 1,
4
+ "VCTK_p227": 2,
5
+ "VCTK_p228": 3,
6
+ "VCTK_p229": 4,
7
+ "VCTK_p230": 5,
8
+ "VCTK_p231": 6,
9
+ "VCTK_p232": 7,
10
+ "VCTK_p233": 8,
11
+ "VCTK_p234": 9,
12
+ "VCTK_p236": 10,
13
+ "VCTK_p237": 11,
14
+ "VCTK_p238": 12,
15
+ "VCTK_p239": 13,
16
+ "VCTK_p240": 14,
17
+ "VCTK_p241": 15,
18
+ "VCTK_p243": 16,
19
+ "VCTK_p244": 17,
20
+ "VCTK_p245": 18,
21
+ "VCTK_p246": 19,
22
+ "VCTK_p247": 20,
23
+ "VCTK_p248": 21,
24
+ "VCTK_p249": 22,
25
+ "VCTK_p250": 23,
26
+ "VCTK_p251": 24,
27
+ "VCTK_p252": 25,
28
+ "VCTK_p253": 26,
29
+ "VCTK_p254": 27,
30
+ "VCTK_p255": 28,
31
+ "VCTK_p256": 29,
32
+ "VCTK_p257": 30,
33
+ "VCTK_p258": 31,
34
+ "VCTK_p259": 32,
35
+ "VCTK_p260": 33,
36
+ "VCTK_p261": 34,
37
+ "VCTK_p262": 35,
38
+ "VCTK_p263": 36,
39
+ "VCTK_p264": 37,
40
+ "VCTK_p265": 38,
41
+ "VCTK_p266": 39,
42
+ "VCTK_p267": 40,
43
+ "VCTK_p268": 41,
44
+ "VCTK_p269": 42,
45
+ "VCTK_p270": 43,
46
+ "VCTK_p271": 44,
47
+ "VCTK_p272": 45,
48
+ "VCTK_p273": 46,
49
+ "VCTK_p274": 47,
50
+ "VCTK_p275": 48,
51
+ "VCTK_p276": 49,
52
+ "VCTK_p277": 50,
53
+ "VCTK_p278": 51,
54
+ "VCTK_p279": 52,
55
+ "VCTK_p280": 53,
56
+ "VCTK_p281": 54,
57
+ "VCTK_p282": 55,
58
+ "VCTK_p283": 56,
59
+ "VCTK_p284": 57,
60
+ "VCTK_p285": 58,
61
+ "VCTK_p286": 59,
62
+ "VCTK_p287": 60,
63
+ "VCTK_p288": 61,
64
+ "VCTK_p292": 62,
65
+ "VCTK_p293": 63,
66
+ "VCTK_p294": 64,
67
+ "VCTK_p295": 65,
68
+ "VCTK_p297": 66,
69
+ "VCTK_p298": 67,
70
+ "VCTK_p299": 68,
71
+ "VCTK_p300": 69,
72
+ "VCTK_p301": 70,
73
+ "VCTK_p302": 71,
74
+ "VCTK_p303": 72,
75
+ "VCTK_p304": 73,
76
+ "VCTK_p305": 74,
77
+ "VCTK_p306": 75,
78
+ "VCTK_p307": 76,
79
+ "VCTK_p308": 77,
80
+ "VCTK_p310": 78,
81
+ "VCTK_p311": 79,
82
+ "VCTK_p312": 80,
83
+ "VCTK_p313": 81,
84
+ "VCTK_p314": 82,
85
+ "VCTK_p316": 83,
86
+ "VCTK_p317": 84,
87
+ "VCTK_p318": 85,
88
+ "VCTK_p323": 86,
89
+ "VCTK_p326": 87,
90
+ "VCTK_p329": 88,
91
+ "VCTK_p330": 89,
92
+ "VCTK_p333": 90,
93
+ "VCTK_p334": 91,
94
+ "VCTK_p335": 92,
95
+ "VCTK_p336": 93,
96
+ "VCTK_p339": 94,
97
+ "VCTK_p340": 95,
98
+ "VCTK_p341": 96,
99
+ "VCTK_p343": 97,
100
+ "VCTK_p345": 98,
101
+ "VCTK_p347": 99,
102
+ "VCTK_p351": 100,
103
+ "VCTK_p360": 101,
104
+ "VCTK_p361": 102,
105
+ "VCTK_p362": 103,
106
+ "VCTK_p363": 104,
107
+ "VCTK_p364": 105,
108
+ "VCTK_p374": 106,
109
+ "VCTK_p376": 107
110
+ }
local_coqui_tts_models/tts_models--en--vctk--vits/config.json ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": null,
3
+ "logger_uri": null,
4
+ "run_name": "",
5
+ "project_name": null,
6
+ "run_description": "",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 10000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 10000,
29
+ "batch_size": null,
30
+ "eval_batch_size": null,
31
+ "grad_clip": [
32
+ 5.0,
33
+ 5.0
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": "",
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": true,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": null,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 22050,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": true,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": false,
71
+ "text_cleaner": "phoneme_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "",
80
+ "bos": "",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
83
+ "punctuations": ";:,.!?\u00a1\u00bf\u2014\u2026\"\u00ab\u00bb\u201c\u201d ",
84
+ "phonemes": "\u0251\u0250\u0252\u00e6\u0253\u0299\u03b2\u0254\u0255\u00e7\u0257\u0256\u00f0\u02a4\u0259\u0258\u025a\u025b\u025c\u025d\u025e\u025f\u0284\u0261\u0260\u0262\u029b\u0266\u0267\u0127\u0265\u029c\u0268\u026a\u029d\u026d\u026c\u026b\u026e\u029f\u0271\u026f\u0270\u014b\u0273\u0272\u0274\u00f8\u0275\u0278\u03b8\u0153\u0276\u0298\u0279\u027a\u027e\u027b\u0280\u0281\u027d\u0282\u0283\u0288\u02a7\u0289\u028a\u028b\u2c71\u028c\u0263\u0264\u028d\u03c7\u028e\u028f\u0291\u0290\u0292\u0294\u02a1\u0295\u02a2\u01c0\u01c1\u01c2\u01c3\u02c8\u02cc\u02d0\u02d1\u02bc\u02b4\u02b0\u02b1\u02b2\u02b7\u02e0\u02e4\u02de\u2193\u2191\u2192\u2197\u2198'\u0329'\u1d7b",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 0,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 0,
99
+ "start_by_longest": false,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "",
105
+ "dataset_name": "",
106
+ "path": "",
107
+ "meta_file_train": "",
108
+ "ignored_speakers": null,
109
+ "language": "",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ }
114
+ ],
115
+ "test_sentences": [
116
+ [
117
+ "I",
118
+ "t",
119
+ " ",
120
+ "t",
121
+ "o",
122
+ "o",
123
+ "k",
124
+ " ",
125
+ "m",
126
+ "e",
127
+ " ",
128
+ "q",
129
+ "u",
130
+ "i",
131
+ "t",
132
+ "e",
133
+ " ",
134
+ "a",
135
+ " ",
136
+ "l",
137
+ "o",
138
+ "n",
139
+ "g",
140
+ " ",
141
+ "t",
142
+ "i",
143
+ "m",
144
+ "e",
145
+ " ",
146
+ "t",
147
+ "o",
148
+ " ",
149
+ "d",
150
+ "e",
151
+ "v",
152
+ "e",
153
+ "l",
154
+ "o",
155
+ "p",
156
+ " ",
157
+ "a",
158
+ " ",
159
+ "v",
160
+ "o",
161
+ "i",
162
+ "c",
163
+ "e",
164
+ ",",
165
+ " ",
166
+ "a",
167
+ "n",
168
+ "d",
169
+ " ",
170
+ "n",
171
+ "o",
172
+ "w",
173
+ " ",
174
+ "t",
175
+ "h",
176
+ "a",
177
+ "t",
178
+ " ",
179
+ "I",
180
+ " ",
181
+ "h",
182
+ "a",
183
+ "v",
184
+ "e",
185
+ " ",
186
+ "i",
187
+ "t",
188
+ " ",
189
+ "I",
190
+ "'",
191
+ "m",
192
+ " ",
193
+ "n",
194
+ "o",
195
+ "t",
196
+ " ",
197
+ "g",
198
+ "o",
199
+ "i",
200
+ "n",
201
+ "g",
202
+ " ",
203
+ "t",
204
+ "o",
205
+ " ",
206
+ "b",
207
+ "e",
208
+ " ",
209
+ "s",
210
+ "i",
211
+ "l",
212
+ "e",
213
+ "n",
214
+ "t",
215
+ "."
216
+ ],
217
+ [
218
+ "B",
219
+ "e",
220
+ " ",
221
+ "a",
222
+ " ",
223
+ "v",
224
+ "o",
225
+ "i",
226
+ "c",
227
+ "e",
228
+ ",",
229
+ " ",
230
+ "n",
231
+ "o",
232
+ "t",
233
+ " ",
234
+ "a",
235
+ "n",
236
+ " ",
237
+ "e",
238
+ "c",
239
+ "h",
240
+ "o",
241
+ "."
242
+ ],
243
+ [
244
+ "I",
245
+ "'",
246
+ "m",
247
+ " ",
248
+ "s",
249
+ "o",
250
+ "r",
251
+ "r",
252
+ "y",
253
+ " ",
254
+ "D",
255
+ "a",
256
+ "v",
257
+ "e",
258
+ ".",
259
+ " ",
260
+ "I",
261
+ "'",
262
+ "m",
263
+ " ",
264
+ "a",
265
+ "f",
266
+ "r",
267
+ "a",
268
+ "i",
269
+ "d",
270
+ " ",
271
+ "I",
272
+ " ",
273
+ "c",
274
+ "a",
275
+ "n",
276
+ "'",
277
+ "t",
278
+ " ",
279
+ "d",
280
+ "o",
281
+ " ",
282
+ "t",
283
+ "h",
284
+ "a",
285
+ "t",
286
+ "."
287
+ ],
288
+ [
289
+ "T",
290
+ "h",
291
+ "i",
292
+ "s",
293
+ " ",
294
+ "c",
295
+ "a",
296
+ "k",
297
+ "e",
298
+ " ",
299
+ "i",
300
+ "s",
301
+ " ",
302
+ "g",
303
+ "r",
304
+ "e",
305
+ "a",
306
+ "t",
307
+ ".",
308
+ " ",
309
+ "I",
310
+ "t",
311
+ "'",
312
+ "s",
313
+ " ",
314
+ "s",
315
+ "o",
316
+ " ",
317
+ "d",
318
+ "e",
319
+ "l",
320
+ "i",
321
+ "c",
322
+ "i",
323
+ "o",
324
+ "u",
325
+ "s",
326
+ " ",
327
+ "a",
328
+ "n",
329
+ "d",
330
+ " ",
331
+ "m",
332
+ "o",
333
+ "i",
334
+ "s",
335
+ "t",
336
+ "."
337
+ ],
338
+ [
339
+ "P",
340
+ "r",
341
+ "i",
342
+ "o",
343
+ "r",
344
+ " ",
345
+ "t",
346
+ "o",
347
+ " ",
348
+ "N",
349
+ "o",
350
+ "v",
351
+ "e",
352
+ "m",
353
+ "b",
354
+ "e",
355
+ "r",
356
+ " ",
357
+ "2",
358
+ "2",
359
+ ",",
360
+ " ",
361
+ "1",
362
+ "9",
363
+ "6",
364
+ "3",
365
+ "."
366
+ ]
367
+ ],
368
+ "eval_split_max_size": null,
369
+ "eval_split_size": 0.01,
370
+ "use_speaker_weighted_sampler": false,
371
+ "speaker_weighted_sampler_alpha": 1.0,
372
+ "use_language_weighted_sampler": false,
373
+ "language_weighted_sampler_alpha": 1.0,
374
+ "use_length_weighted_sampler": false,
375
+ "length_weighted_sampler_alpha": 1.0,
376
+ "model_args": {
377
+ "num_chars": 179,
378
+ "out_channels": 513,
379
+ "spec_segment_size": 32,
380
+ "hidden_channels": 192,
381
+ "hidden_channels_ffn_text_encoder": 768,
382
+ "num_heads_text_encoder": 2,
383
+ "num_layers_text_encoder": 6,
384
+ "kernel_size_text_encoder": 3,
385
+ "dropout_p_text_encoder": 0.1,
386
+ "dropout_p_duration_predictor": 0.5,
387
+ "kernel_size_posterior_encoder": 5,
388
+ "dilation_rate_posterior_encoder": 1,
389
+ "num_layers_posterior_encoder": 16,
390
+ "kernel_size_flow": 5,
391
+ "dilation_rate_flow": 1,
392
+ "num_layers_flow": 4,
393
+ "resblock_type_decoder": "1",
394
+ "resblock_kernel_sizes_decoder": [
395
+ 3,
396
+ 7,
397
+ 11
398
+ ],
399
+ "resblock_dilation_sizes_decoder": [
400
+ [
401
+ 1,
402
+ 3,
403
+ 5
404
+ ],
405
+ [
406
+ 1,
407
+ 3,
408
+ 5
409
+ ],
410
+ [
411
+ 1,
412
+ 3,
413
+ 5
414
+ ]
415
+ ],
416
+ "upsample_rates_decoder": [
417
+ 8,
418
+ 8,
419
+ 2,
420
+ 2
421
+ ],
422
+ "upsample_initial_channel_decoder": 512,
423
+ "upsample_kernel_sizes_decoder": [
424
+ 16,
425
+ 16,
426
+ 4,
427
+ 4
428
+ ],
429
+ "periods_multi_period_discriminator": [
430
+ 2,
431
+ 3,
432
+ 5,
433
+ 7,
434
+ 11
435
+ ],
436
+ "use_sdp": true,
437
+ "noise_scale": 1.0,
438
+ "inference_noise_scale": 0.667,
439
+ "length_scale": 1.0,
440
+ "noise_scale_dp": 1.0,
441
+ "inference_noise_scale_dp": 0.8,
442
+ "max_inference_len": null,
443
+ "init_discriminator": false,
444
+ "use_spectral_norm_disriminator": false,
445
+ "use_speaker_embedding": true,
446
+ "num_speakers": 109,
447
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--en--vctk--vits/speaker_ids.json",
448
+ "d_vector_file": null,
449
+ "speaker_embedding_channels": 256,
450
+ "use_d_vector_file": false,
451
+ "d_vector_dim": 0,
452
+ "detach_dp_input": true,
453
+ "use_language_embedding": false,
454
+ "embedded_language_dim": 4,
455
+ "num_languages": 0,
456
+ "language_ids_file": null,
457
+ "use_speaker_encoder_as_loss": false,
458
+ "speaker_encoder_config_path": "",
459
+ "speaker_encoder_model_path": "",
460
+ "condition_dp_on_speaker": true,
461
+ "freeze_encoder": false,
462
+ "freeze_DP": false,
463
+ "freeze_PE": false,
464
+ "freeze_flow_decoder": false,
465
+ "freeze_waveform_decoder": false,
466
+ "encoder_sample_rate": null,
467
+ "interpolate_z": true,
468
+ "reinit_DP": false,
469
+ "reinit_text_encoder": false
470
+ },
471
+ "lr_gen": 0.0002,
472
+ "lr_disc": 0.0002,
473
+ "lr_scheduler_gen": "ExponentialLR",
474
+ "lr_scheduler_gen_params": {
475
+ "gamma": 0.999875,
476
+ "last_epoch": -1
477
+ },
478
+ "lr_scheduler_disc": "ExponentialLR",
479
+ "lr_scheduler_disc_params": {
480
+ "gamma": 0.999875,
481
+ "last_epoch": -1
482
+ },
483
+ "kl_loss_alpha": 1.0,
484
+ "disc_loss_alpha": 1.0,
485
+ "gen_loss_alpha": 1.0,
486
+ "feat_loss_alpha": 1.0,
487
+ "mel_loss_alpha": 45.0,
488
+ "dur_loss_alpha": 1.0,
489
+ "speaker_encoder_loss_alpha": 1.0,
490
+ "return_wav": true,
491
+ "use_weighted_sampler": false,
492
+ "weighted_sampler_attrs": {},
493
+ "weighted_sampler_multipliers": {},
494
+ "r": 1,
495
+ "num_speakers": 0,
496
+ "use_speaker_embedding": false,
497
+ "speakers_file": "/Users/drew/Library/Application Support/tts/tts_models--en--vctk--vits/speaker_ids.json",
498
+ "speaker_embedding_channels": 256,
499
+ "language_ids_file": null,
500
+ "use_language_embedding": false,
501
+ "use_d_vector_file": false,
502
+ "d_vector_file": null,
503
+ "d_vector_dim": 0
504
+ }
local_coqui_tts_models/tts_models--en--vctk--vits/model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbec6b420abcc677fe4a357994ee68f8f3b6fa84502e7accad42b11a79f6ad0d
3
+ size 159111821
local_coqui_tts_models/tts_models--en--vctk--vits/speaker_ids.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ED\n": 0,
3
+ "p225": 1,
4
+ "p226": 2,
5
+ "p227": 3,
6
+ "p228": 4,
7
+ "p229": 5,
8
+ "p230": 6,
9
+ "p231": 7,
10
+ "p232": 8,
11
+ "p233": 9,
12
+ "p234": 10,
13
+ "p236": 11,
14
+ "p237": 12,
15
+ "p238": 13,
16
+ "p239": 14,
17
+ "p240": 15,
18
+ "p241": 16,
19
+ "p243": 17,
20
+ "p244": 18,
21
+ "p245": 19,
22
+ "p246": 20,
23
+ "p247": 21,
24
+ "p248": 22,
25
+ "p249": 23,
26
+ "p250": 24,
27
+ "p251": 25,
28
+ "p252": 26,
29
+ "p253": 27,
30
+ "p254": 28,
31
+ "p255": 29,
32
+ "p256": 30,
33
+ "p257": 31,
34
+ "p258": 32,
35
+ "p259": 33,
36
+ "p260": 34,
37
+ "p261": 35,
38
+ "p262": 36,
39
+ "p263": 37,
40
+ "p264": 38,
41
+ "p265": 39,
42
+ "p266": 40,
43
+ "p267": 41,
44
+ "p268": 42,
45
+ "p269": 43,
46
+ "p270": 44,
47
+ "p271": 45,
48
+ "p272": 46,
49
+ "p273": 47,
50
+ "p274": 48,
51
+ "p275": 49,
52
+ "p276": 50,
53
+ "p277": 51,
54
+ "p278": 52,
55
+ "p279": 53,
56
+ "p280": 54,
57
+ "p281": 55,
58
+ "p282": 56,
59
+ "p283": 57,
60
+ "p284": 58,
61
+ "p285": 59,
62
+ "p286": 60,
63
+ "p287": 61,
64
+ "p288": 62,
65
+ "p292": 63,
66
+ "p293": 64,
67
+ "p294": 65,
68
+ "p295": 66,
69
+ "p297": 67,
70
+ "p298": 68,
71
+ "p299": 69,
72
+ "p300": 70,
73
+ "p301": 71,
74
+ "p302": 72,
75
+ "p303": 73,
76
+ "p304": 74,
77
+ "p305": 75,
78
+ "p306": 76,
79
+ "p307": 77,
80
+ "p308": 78,
81
+ "p310": 79,
82
+ "p311": 80,
83
+ "p312": 81,
84
+ "p313": 82,
85
+ "p314": 83,
86
+ "p316": 84,
87
+ "p317": 85,
88
+ "p318": 86,
89
+ "p323": 87,
90
+ "p326": 88,
91
+ "p329": 89,
92
+ "p330": 90,
93
+ "p333": 91,
94
+ "p334": 92,
95
+ "p335": 93,
96
+ "p336": 94,
97
+ "p339": 95,
98
+ "p340": 96,
99
+ "p341": 97,
100
+ "p343": 98,
101
+ "p345": 99,
102
+ "p347": 100,
103
+ "p351": 101,
104
+ "p360": 102,
105
+ "p361": 103,
106
+ "p362": 104,
107
+ "p363": 105,
108
+ "p364": 106,
109
+ "p374": 107,
110
+ "p376": 108
111
+ }
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/.DS_Store ADDED
Binary file (6.15 kB). View file
 
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/config.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 10000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 32,
30
+ "eval_batch_size": 16,
31
+ "grad_clip": 0.0,
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.001,
34
+ "optimizer": "radam",
35
+ "optimizer_params": null,
36
+ "lr_scheduler": null,
37
+ "lr_scheduler_params": {},
38
+ "use_grad_scaler": false,
39
+ "allow_tf32": false,
40
+ "cudnn_enable": true,
41
+ "cudnn_deterministic": false,
42
+ "cudnn_benchmark": false,
43
+ "training_seed": 54321,
44
+ "model": "xtts",
45
+ "num_loader_workers": 0,
46
+ "num_eval_loader_workers": 0,
47
+ "use_noise_augment": false,
48
+ "audio": {
49
+ "sample_rate": 22050,
50
+ "output_sample_rate": 24000
51
+ },
52
+ "use_phonemes": false,
53
+ "phonemizer": null,
54
+ "phoneme_language": null,
55
+ "compute_input_seq_cache": false,
56
+ "text_cleaner": null,
57
+ "enable_eos_bos_chars": false,
58
+ "test_sentences_file": "",
59
+ "phoneme_cache_path": null,
60
+ "characters": null,
61
+ "add_blank": false,
62
+ "batch_group_size": 0,
63
+ "loss_masking": null,
64
+ "min_audio_len": 1,
65
+ "max_audio_len": Infinity,
66
+ "min_text_len": 1,
67
+ "max_text_len": Infinity,
68
+ "compute_f0": false,
69
+ "compute_energy": false,
70
+ "compute_linear_spec": false,
71
+ "precompute_num_workers": 0,
72
+ "start_by_longest": false,
73
+ "shuffle": false,
74
+ "drop_last": false,
75
+ "datasets": [
76
+ {
77
+ "formatter": "",
78
+ "dataset_name": "",
79
+ "path": "",
80
+ "meta_file_train": "",
81
+ "ignored_speakers": null,
82
+ "language": "",
83
+ "phonemizer": "",
84
+ "meta_file_val": "",
85
+ "meta_file_attn_mask": ""
86
+ }
87
+ ],
88
+ "test_sentences": [],
89
+ "eval_split_max_size": null,
90
+ "eval_split_size": 0.01,
91
+ "use_speaker_weighted_sampler": false,
92
+ "speaker_weighted_sampler_alpha": 1.0,
93
+ "use_language_weighted_sampler": false,
94
+ "language_weighted_sampler_alpha": 1.0,
95
+ "use_length_weighted_sampler": false,
96
+ "length_weighted_sampler_alpha": 1.0,
97
+ "model_args": {
98
+ "gpt_batch_size": 1,
99
+ "enable_redaction": false,
100
+ "kv_cache": true,
101
+ "gpt_checkpoint": null,
102
+ "clvp_checkpoint": null,
103
+ "decoder_checkpoint": null,
104
+ "num_chars": 255,
105
+ "tokenizer_file": "",
106
+ "gpt_max_audio_tokens": 605,
107
+ "gpt_max_text_tokens": 402,
108
+ "gpt_max_prompt_tokens": 70,
109
+ "gpt_layers": 30,
110
+ "gpt_n_model_channels": 1024,
111
+ "gpt_n_heads": 16,
112
+ "gpt_number_text_tokens": 6681,
113
+ "gpt_start_text_token": null,
114
+ "gpt_stop_text_token": null,
115
+ "gpt_num_audio_tokens": 1026,
116
+ "gpt_start_audio_token": 1024,
117
+ "gpt_stop_audio_token": 1025,
118
+ "gpt_code_stride_len": 1024,
119
+ "gpt_use_masking_gt_prompt_approach": true,
120
+ "gpt_use_perceiver_resampler": true,
121
+ "input_sample_rate": 22050,
122
+ "output_sample_rate": 24000,
123
+ "output_hop_length": 256,
124
+ "decoder_input_dim": 1024,
125
+ "d_vector_dim": 512,
126
+ "cond_d_vector_in_each_upsampling_layer": true,
127
+ "duration_const": 102400
128
+ },
129
+ "model_dir": null,
130
+ "languages": [
131
+ "en",
132
+ "es",
133
+ "fr",
134
+ "de",
135
+ "it",
136
+ "pt",
137
+ "pl",
138
+ "tr",
139
+ "ru",
140
+ "nl",
141
+ "cs",
142
+ "ar",
143
+ "zh-cn",
144
+ "hu",
145
+ "ko",
146
+ "ja",
147
+ "hi"
148
+ ],
149
+ "temperature": 0.75,
150
+ "length_penalty": 1.0,
151
+ "repetition_penalty": 5.0,
152
+ "top_k": 50,
153
+ "top_p": 0.85,
154
+ "num_gpt_outputs": 1,
155
+ "gpt_cond_len": 30,
156
+ "gpt_cond_chunk_len": 4,
157
+ "max_ref_len": 30,
158
+ "sound_norm_refs": false
159
+ }
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/hash.md5 ADDED
@@ -0,0 +1 @@
 
 
1
+ 10f92b55c512af7a8d39d650547a15a7
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ea20001c6a0a841c77e252d8409f6a74fb423e79b3206a0771ba5989776187
3
+ size 1867929118
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/tos_agreed.txt ADDED
File without changes
local_coqui_tts_models/tts_models--multilingual--multi-dataset--xtts_v2/vocab.json ADDED
The diff for this file is too large to render. See raw diff