azamat committed
Commit 6127b48 · 1 Parent(s): 1f948e1
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. TTS/.models.json +500 -0
  2. TTS/__init__.py +6 -0
  3. TTS/bin/__init__.py +0 -0
  4. TTS/bin/collect_env_info.py +48 -0
  5. TTS/bin/compute_attention_masks.py +165 -0
  6. TTS/bin/compute_embeddings.py +84 -0
  7. TTS/bin/compute_statistics.py +96 -0
  8. TTS/bin/eval_encoder.py +89 -0
  9. TTS/bin/extract_tts_spectrograms.py +287 -0
  10. TTS/bin/find_unique_chars.py +45 -0
  11. TTS/bin/find_unique_phonemes.py +70 -0
  12. TTS/bin/remove_silence_using_vad.py +85 -0
  13. TTS/bin/resample.py +87 -0
  14. TTS/bin/synthesize.py +425 -0
  15. TTS/bin/train_encoder.py +319 -0
  16. TTS/bin/train_tts.py +71 -0
  17. TTS/bin/train_vocoder.py +77 -0
  18. TTS/bin/tune_wavegrad.py +100 -0
  19. TTS/config/__init__.py +132 -0
  20. TTS/config/shared_configs.py +260 -0
  21. TTS/encoder/README.md +18 -0
  22. TTS/encoder/__init__.py +0 -0
  23. TTS/encoder/configs/base_encoder_config.py +61 -0
  24. TTS/encoder/configs/emotion_encoder_config.py +12 -0
  25. TTS/encoder/configs/speaker_encoder_config.py +11 -0
  26. TTS/encoder/dataset.py +147 -0
  27. TTS/encoder/losses.py +226 -0
  28. TTS/encoder/models/base_encoder.py +154 -0
  29. TTS/encoder/models/lstm.py +99 -0
  30. TTS/encoder/models/resnet.py +200 -0
  31. TTS/encoder/requirements.txt +2 -0
  32. TTS/encoder/utils/__init__.py +0 -0
  33. TTS/encoder/utils/generic_utils.py +184 -0
  34. TTS/encoder/utils/io.py +38 -0
  35. TTS/encoder/utils/prepare_voxceleb.py +219 -0
  36. TTS/encoder/utils/samplers.py +114 -0
  37. TTS/encoder/utils/training.py +99 -0
  38. TTS/encoder/utils/visual.py +50 -0
  39. TTS/model.py +56 -0
  40. TTS/server/README.md +18 -0
  41. TTS/server/__init__.py +0 -0
  42. TTS/server/conf.json +12 -0
  43. TTS/server/server.py +190 -0
  44. TTS/server/static/coqui-log-green-TTS.png +0 -0
  45. TTS/server/templates/details.html +131 -0
  46. TTS/server/templates/index.html +143 -0
  47. TTS/tts/__init__.py +0 -0
  48. TTS/tts/configs/__init__.py +17 -0
  49. TTS/tts/configs/align_tts_config.py +107 -0
  50. TTS/tts/configs/fast_pitch_config.py +182 -0
TTS/.models.json ADDED
@@ -0,0 +1,500 @@
1
+ {
2
+ "tts_models": {
3
+ "multilingual":{
4
+ "multi-dataset":{
5
+ "your_tts":{
6
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
7
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
8
+ "default_vocoder": null,
9
+ "commit": "e9a1953e",
10
+ "license": "CC BY-NC-ND 4.0",
11
+ "contact": "[email protected]"
12
+ }
13
+ }
14
+ },
15
+ "en": {
16
+ "ek1": {
17
+ "tacotron2": {
18
+ "description": "EK1 en-rp tacotron2 by NMStoker",
19
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
20
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
21
+ "commit": "c802255",
22
+ "license": "apache 2.0"
23
+ }
24
+ },
25
+ "ljspeech": {
26
+ "tacotron2-DDC": {
27
+ "description": "Tacotron2 with Double Decoder Consistency.",
28
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
29
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
30
+ "commit": "bae2ad0f",
31
+ "author": "Eren Gölge @erogol",
32
+ "license": "apache 2.0",
33
+ "contact": "[email protected]"
34
+ },
35
+ "tacotron2-DDC_ph": {
36
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
37
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
38
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
39
+ "commit": "3900448",
40
+ "author": "Eren Gölge @erogol",
41
+ "license": "apache 2.0",
42
+ "contact": "[email protected]"
43
+ },
44
+ "glow-tts": {
45
+ "description": "",
46
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
47
+ "stats_file": null,
48
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
49
+ "commit": "",
50
+ "author": "Eren Gölge @erogol",
51
+ "license": "MPL",
52
+ "contact": "[email protected]"
53
+ },
54
+ "speedy-speech": {
55
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
56
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
57
+ "stats_file": null,
58
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
59
+ "commit": "4581e3d",
60
+ "author": "Eren Gölge @erogol",
61
+ "license": "apache 2.0",
62
+ "contact": "[email protected]"
63
+ },
64
+ "tacotron2-DCA": {
65
+ "description": "",
66
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
67
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
68
+ "commit": "",
69
+ "author": "Eren Gölge @erogol",
70
+ "license": "MPL",
71
+ "contact": "[email protected]"
72
+ },
73
+ "vits": {
74
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
75
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
76
+ "default_vocoder": null,
77
+ "commit": "3900448",
78
+ "author": "Eren Gölge @erogol",
79
+ "license": "apache 2.0",
80
+ "contact": "[email protected]"
81
+ },
82
+ "fast_pitch": {
83
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
84
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
85
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
86
+ "commit": "b27b3ba",
87
+ "author": "Eren Gölge @erogol",
88
+ "license": "apache 2.0",
89
+ "contact": "[email protected]"
90
+ }
91
+ },
92
+ "vctk": {
93
+ "vits": {
94
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
95
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
96
+ "default_vocoder": null,
97
+ "commit": "3900448",
98
+ "author": "Eren @erogol",
99
+ "license": "apache 2.0",
100
+ "contact": "[email protected]"
101
+ },
102
+ "fast_pitch":{
103
+ "description": "FastPitch model trained on the VCTK dataset.",
104
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
105
+ "default_vocoder": null,
106
+ "commit": "bdab788d",
107
+ "author": "Eren @erogol",
108
+ "license": "CC BY-NC-ND 4.0",
109
+ "contact": "[email protected]"
110
+ }
111
+ },
112
+ "sam": {
113
+ "tacotron-DDC": {
114
+ "description": "Tacotron2 with Double Decoder Consistency trained with Accenture's Sam dataset.",
115
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
116
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
117
+ "commit": "bae2ad0f",
118
+ "author": "Eren Gölge @erogol",
119
+ "license": "apache 2.0",
120
+ "contact": "[email protected]"
121
+ }
122
+ },
123
+ "blizzard2013": {
124
+ "capacitron-t2-c50": {
125
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
126
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
127
+ "commit": "d6284e7",
128
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
129
+ "author": "Adam Froghyar @a-froghyar",
130
+ "license": "apache 2.0",
131
+ "contact": "[email protected]"
132
+ },
133
+ "capacitron-t2-c150": {
134
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
135
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
136
+ "commit": "d6284e7",
137
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
138
+ "author": "Adam Froghyar @a-froghyar",
139
+ "license": "apache 2.0",
140
+ "contact": "[email protected]"
141
+ }
142
+ }
143
+ },
144
+ "es": {
145
+ "mai": {
146
+ "tacotron2-DDC": {
147
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
148
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
149
+ "commit": "",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "MPL",
152
+ "contact": "[email protected]"
153
+ }
154
+ }
155
+ },
156
+ "fr": {
157
+ "mai": {
158
+ "tacotron2-DDC": {
159
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
160
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
161
+ "commit": "",
162
+ "author": "Eren Gölge @erogol",
163
+ "license": "MPL",
164
+ "contact": "[email protected]"
165
+ }
166
+ }
167
+ },
168
+ "uk":{
169
+ "mai": {
170
+ "glow-tts": {
171
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
172
+ "author":"@robinhad",
173
+ "commit": "bdab788d",
174
+ "license": "MIT",
175
+ "contact": "",
176
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
177
+ }
178
+ }
179
+ },
180
+ "zh-CN": {
181
+ "baker": {
182
+ "tacotron2-DDC-GST": {
183
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
184
+ "commit": "unknown",
185
+ "author": "@kirianguiller",
186
+ "license": "apache 2.0",
187
+ "default_vocoder": null
188
+ }
189
+ }
190
+ },
191
+ "nl": {
192
+ "mai": {
193
+ "tacotron2-DDC": {
194
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
195
+ "author": "@r-dh",
196
+ "license": "apache 2.0",
197
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
198
+ "stats_file": null,
199
+ "commit": "540d811"
200
+ }
201
+ }
202
+ },
203
+ "de": {
204
+ "thorsten": {
205
+ "tacotron2-DCA": {
206
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
207
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
208
+ "author": "@thorstenMueller",
209
+ "license": "apache 2.0",
210
+ "commit": "unknown"
211
+ },
212
+ "vits": {
213
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
214
+ "default_vocoder": null,
215
+ "author": "@thorstenMueller",
216
+ "license": "apache 2.0",
217
+ "commit": "unknown"
218
+ }
219
+ }
220
+ },
221
+ "ja": {
222
+ "kokoro": {
223
+ "tacotron2-DDC": {
224
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
225
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
226
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
227
+ "author": "@kaiidams",
228
+ "license": "apache 2.0",
229
+ "commit": "401fbd89"
230
+ }
231
+ }
232
+ },
233
+ "tr":{
234
+ "common-voice": {
235
+ "glow-tts":{
236
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
237
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
238
+ "license": "MIT",
239
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
240
+ "author": "Fatih Akademi",
241
+ "commit": null
242
+ }
243
+ }
244
+ },
245
+ "it": {
246
+ "mai_female": {
247
+ "glow-tts":{
248
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
249
+ "default_vocoder": null,
250
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
251
+ "author": "@nicolalandro",
252
+ "license": "apache 2.0",
253
+ "commit": null
254
+ },
255
+ "vits":{
256
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
257
+ "default_vocoder": null,
258
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
259
+ "author": "@nicolalandro",
260
+ "license": "apache 2.0",
261
+ "commit": null
262
+ }
263
+ },
264
+ "mai_male": {
265
+ "glow-tts":{
266
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
267
+ "default_vocoder": null,
268
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
269
+ "author": "@nicolalandro",
270
+ "license": "apache 2.0",
271
+ "commit": null
272
+ },
273
+ "vits":{
274
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
275
+ "default_vocoder": null,
276
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
277
+ "author": "@nicolalandro",
278
+ "license": "apache 2.0",
279
+ "commit": null
280
+ }
281
+ }
282
+ },
283
+ "ewe": {
284
+ "openbible": {
285
+ "vits":{
286
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
287
+ "default_vocoder": null,
288
+ "license": "CC-BY-SA 4.0",
289
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
290
+ "author": "@coqui_ai",
291
+ "commit": "1b22f03"
292
+ }
293
+ }
294
+ },
295
+ "hau": {
296
+ "openbible": {
297
+ "vits":{
298
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
299
+ "default_vocoder": null,
300
+ "license": "CC-BY-SA 4.0",
301
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
302
+ "author": "@coqui_ai",
303
+ "commit": "1b22f03"
304
+ }
305
+ }
306
+ },
307
+ "lin": {
308
+ "openbible": {
309
+ "vits":{
310
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
311
+ "default_vocoder": null,
312
+ "license": "CC-BY-SA 4.0",
313
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
314
+ "author": "@coqui_ai",
315
+ "commit": "1b22f03"
316
+ }
317
+ }
318
+ },
319
+ "tw_akuapem": {
320
+ "openbible": {
321
+ "vits":{
322
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
323
+ "default_vocoder": null,
324
+ "license": "CC-BY-SA 4.0",
325
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
326
+ "author": "@coqui_ai",
327
+ "commit": "1b22f03"
328
+ }
329
+ }
330
+ },
331
+ "tw_asante": {
332
+ "openbible": {
333
+ "vits":{
334
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
335
+ "default_vocoder": null,
336
+ "license": "CC-BY-SA 4.0",
337
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
338
+ "author": "@coqui_ai",
339
+ "commit": "1b22f03"
340
+ }
341
+ }
342
+ },
343
+ "yor": {
344
+ "openbible": {
345
+ "vits":{
346
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
347
+ "default_vocoder": null,
348
+ "license": "CC-BY-SA 4.0",
349
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
350
+ "author": "@coqui_ai",
351
+ "commit": "1b22f03"
352
+ }
353
+ }
354
+ }
355
+ },
356
+ "vocoder_models": {
357
+ "universal": {
358
+ "libri-tts": {
359
+ "wavegrad": {
360
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
361
+ "commit": "ea976b0",
362
+ "author": "Eren Gölge @erogol",
363
+ "license": "MPL",
364
+ "contact": "[email protected]"
365
+ },
366
+ "fullband-melgan": {
367
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
368
+ "commit": "4132240",
369
+ "author": "Eren Gölge @erogol",
370
+ "license": "MPL",
371
+ "contact": "[email protected]"
372
+ }
373
+ }
374
+ },
375
+ "en": {
376
+ "ek1": {
377
+ "wavegrad": {
378
+ "description": "EK1 en-rp wavegrad by NMStoker",
379
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
380
+ "commit": "c802255",
381
+ "license": "apache 2.0"
382
+ }
383
+ },
384
+ "ljspeech": {
385
+ "multiband-melgan": {
386
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
387
+ "commit": "ea976b0",
388
+ "author": "Eren Gölge @erogol",
389
+ "license": "MPL",
390
+ "contact": "[email protected]"
391
+ },
392
+ "hifigan_v2": {
393
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
394
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
395
+ "commit": "bae2ad0f",
396
+ "author": "@erogol",
397
+ "license": "apache 2.0",
398
+ "contact": "[email protected]"
399
+ },
400
+ "univnet": {
401
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
402
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
403
+ "commit": "4581e3d",
404
+ "author": "Eren @erogol",
405
+ "license": "apache 2.0",
406
+ "contact": "[email protected]"
407
+ }
408
+ },
409
+ "blizzard2013": {
410
+ "hifigan_v2": {
411
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
412
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
413
+ "commit": "d6284e7",
414
+ "author": "Adam Froghyar @a-froghyar",
415
+ "license": "apache 2.0",
416
+ "contact": "[email protected]"
417
+ }
418
+ },
419
+ "vctk": {
420
+ "hifigan_v2": {
421
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
422
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
423
+ "commit": "2f07160",
424
+ "author": "Edresson Casanova",
425
+ "license": "apache 2.0",
426
+ "contact": ""
427
+ }
428
+ },
429
+ "sam": {
430
+ "hifigan_v2": {
431
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
432
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
433
+ "commit": "2f07160",
434
+ "author": "Eren Gölge @erogol",
435
+ "license": "apache 2.0",
436
+ "contact": "[email protected]"
437
+ }
438
+ }
439
+ },
440
+ "nl": {
441
+ "mai": {
442
+ "parallel-wavegan": {
443
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
444
+ "author": "@r-dh",
445
+ "license": "apache 2.0",
446
+ "commit": "unknown"
447
+ }
448
+ }
449
+ },
450
+ "de": {
451
+ "thorsten": {
452
+ "wavegrad": {
453
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
454
+ "author": "@thorstenMueller",
455
+ "license": "apache 2.0",
456
+ "commit": "unknown"
457
+ },
458
+ "fullband-melgan": {
459
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
460
+ "author": "@thorstenMueller",
461
+ "license": "apache 2.0",
462
+ "commit": "unknown"
463
+ }
464
+ }
465
+ },
466
+ "ja": {
467
+ "kokoro": {
468
+ "hifigan_v1": {
469
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
470
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
471
+ "author": "@kaiidams",
472
+ "license": "apache 2.0",
473
+ "commit": "3900448"
474
+ }
475
+ }
476
+ },
477
+ "uk": {
478
+ "mai": {
479
+ "multiband-melgan": {
480
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
481
+ "author":"@robinhad",
482
+ "commit": "bdab788d",
483
+ "license": "MIT",
484
+ "contact": ""
485
+ }
486
+ }
487
+ },
488
+ "tr":{
489
+ "common-voice": {
490
+ "hifigan":{
491
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
492
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
493
+ "author": "Fatih Akademi",
494
+ "license": "MIT",
495
+ "commit": null
496
+ }
497
+ }
498
+ }
499
+ }
500
+ }
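Note: .models.json is the manifest that model names of the form "tts_models/en/ljspeech/tacotron2-DDC" (type/language/dataset/model) resolve against. As a minimal sketch, not the library's own downloader (resolve_model below is a hypothetical helper), an entry can be mapped to its release URL and default vocoder by walking the nested keys:

    import json

    def resolve_model(manifest_path, model_name):
        # model_name follows the type/language/dataset/model layout used above,
        # e.g. "tts_models/en/ljspeech/tacotron2-DDC"
        model_type, lang, dataset, model = model_name.split("/")
        with open(manifest_path, "r", encoding="utf-8") as f:
            manifest = json.load(f)
        entry = manifest[model_type][lang][dataset][model]
        # "default_vocoder" is null (None) for end-to-end models such as VITS
        return entry["github_rls_url"], entry.get("default_vocoder")

    url, vocoder = resolve_model("TTS/.models.json", "tts_models/en/ljspeech/tacotron2-DDC")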
TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ import os
2
+
3
+ with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
4
+ version = f.read().strip()
5
+
6
+ __version__ = version
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,48 @@
1
+ """Get detailed info about the working environment."""
2
+ import os
3
+ import platform
4
+ import sys
5
+
6
+ import numpy
7
+ import torch
8
+
9
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
+ import json
11
+
12
+ import TTS
13
+
14
+
15
+ def system_info():
16
+ return {
17
+ "OS": platform.system(),
18
+ "architecture": platform.architecture(),
19
+ "version": platform.version(),
20
+ "processor": platform.processor(),
21
+ "python": platform.python_version(),
22
+ }
23
+
24
+
25
+ def cuda_info():
26
+ return {
27
+ "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28
+ "available": torch.cuda.is_available(),
29
+ "version": torch.version.cuda,
30
+ }
31
+
32
+
33
+ def package_info():
34
+ return {
35
+ "numpy": numpy.__version__,
36
+ "PyTorch_version": torch.__version__,
37
+ "PyTorch_debug": torch.version.debug,
38
+ "TTS": TTS.__version__,
39
+ }
40
+
41
+
42
+ def main():
43
+ details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44
+ print(json.dumps(details, indent=4, sort_keys=True))
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
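Note: besides the CLI entry point, the three helpers above can be reused directly. A minimal sketch, assuming the TTS package is importable (i.e. installed with its VERSION file present):

    from TTS.bin.collect_env_info import cuda_info, package_info, system_info

    # same payload the CLI prints with json.dumps, but kept as a plain dict
    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
    print(details["Packages"]["TTS"], details["CUDA"]["available"])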
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
1
+ import argparse
2
+ import importlib
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ from tqdm import tqdm
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets.TTSDataset import TTSDataset
13
+ from TTS.tts.models import setup_model
14
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.io import load_checkpoint
17
+
18
+ if __name__ == "__main__":
19
+ # pylint: disable=bad-option-value
20
+ parser = argparse.ArgumentParser(
21
+ description="""Extract attention masks from trained Tacotron/Tacotron2 models.
22
+ These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
23
+ """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
24
+ (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
25
+ """
26
+ Example run:
27
+ CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
28
+ --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
29
+ --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
30
+ --dataset_metafile metadata.csv
31
+ --data_path /root/LJSpeech-1.1/
32
+ --batch_size 32
33
+ --dataset ljspeech
34
+ --use_cuda True
35
+ """,
36
+ formatter_class=RawTextHelpFormatter,
37
+ )
38
+ parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
39
+ parser.add_argument(
40
+ "--config_path",
41
+ type=str,
42
+ required=True,
43
+ help="Path to Tacotron/Tacotron2 config file.",
44
+ )
45
+ parser.add_argument(
46
+ "--dataset",
47
+ type=str,
48
+ default="",
49
+ required=True,
50
+ help="Target dataset processor name from TTS.tts.dataset.preprocess.",
51
+ )
52
+
53
+ parser.add_argument(
54
+ "--dataset_metafile",
55
+ type=str,
56
+ default="",
57
+ required=True,
58
+ help="Dataset metafile including file paths with transcripts.",
59
+ )
60
+ parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
61
+ parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
62
+
63
+ parser.add_argument(
64
+ "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
65
+ )
66
+ args = parser.parse_args()
67
+
68
+ C = load_config(args.config_path)
69
+ ap = AudioProcessor(**C.audio)
70
+
71
+ # if the vocabulary was passed, replace the default
72
+ if "characters" in C.keys():
73
+ symbols, phonemes = make_symbols(**C.characters)
74
+
75
+ # load the model
76
+ num_chars = len(phonemes) if C.use_phonemes else len(symbols)
77
+ # TODO: handle multi-speaker
78
+ model = setup_model(C)
79
+ model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
80
+
81
+ # data loader
82
+ preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
83
+ preprocessor = getattr(preprocessor, args.dataset)
84
+ meta_data = preprocessor(args.data_path, args.dataset_metafile)
85
+ dataset = TTSDataset(
86
+ model.decoder.r,
87
+ C.text_cleaner,
88
+ compute_linear_spec=False,
89
+ ap=ap,
90
+ meta_data=meta_data,
91
+ characters=C.characters if "characters" in C.keys() else None,
92
+ add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
93
+ use_phonemes=C.use_phonemes,
94
+ phoneme_cache_path=C.phoneme_cache_path,
95
+ phoneme_language=C.phoneme_language,
96
+ enable_eos_bos=C.enable_eos_bos_chars,
97
+ )
98
+
99
+ dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
100
+ loader = DataLoader(
101
+ dataset,
102
+ batch_size=args.batch_size,
103
+ num_workers=4,
104
+ collate_fn=dataset.collate_fn,
105
+ shuffle=False,
106
+ drop_last=False,
107
+ )
108
+
109
+ # compute attentions
110
+ file_paths = []
111
+ with torch.no_grad():
112
+ for data in tqdm(loader):
113
+ # setup input data
114
+ text_input = data[0]
115
+ text_lengths = data[1]
116
+ linear_input = data[3]
117
+ mel_input = data[4]
118
+ mel_lengths = data[5]
119
+ stop_targets = data[6]
120
+ item_idxs = data[7]
121
+
122
+ # dispatch data to GPU
123
+ if args.use_cuda:
124
+ text_input = text_input.cuda()
125
+ text_lengths = text_lengths.cuda()
126
+ mel_input = mel_input.cuda()
127
+ mel_lengths = mel_lengths.cuda()
128
+
129
+ model_outputs = model.forward(text_input, text_lengths, mel_input)
130
+
131
+ alignments = model_outputs["alignments"].detach()
132
+ for idx, alignment in enumerate(alignments):
133
+ item_idx = item_idxs[idx]
134
+ # interpolate if r > 1
135
+ alignment = (
136
+ torch.nn.functional.interpolate(
137
+ alignment.transpose(0, 1).unsqueeze(0),
138
+ size=None,
139
+ scale_factor=model.decoder.r,
140
+ mode="nearest",
141
+ align_corners=None,
142
+ recompute_scale_factor=None,
143
+ )
144
+ .squeeze(0)
145
+ .transpose(0, 1)
146
+ )
147
+ # remove paddings
148
+ alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
149
+ # set file paths
150
+ wav_file_name = os.path.basename(item_idx)
151
+ align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
152
+ file_path = item_idx.replace(wav_file_name, align_file_name)
153
+ # save output
154
+ wav_file_abs_path = os.path.abspath(item_idx)
155
+ file_abs_path = os.path.abspath(file_path)
156
+ file_paths.append([wav_file_abs_path, file_abs_path])
157
+ np.save(file_path, alignment)
158
+
159
+ # output metafile
160
+ metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
161
+
162
+ with open(metafile, "w", encoding="utf-8") as f:
163
+ for p in file_paths:
164
+ f.write(f"{p[0]}|{p[1]}\n")
165
+ print(f" >> Metafile created: {metafile}")
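Note: for every input wav the script stores the cropped alignment as "*_attn.npy" next to it and lists the pairs in metadata_attn_mask.txt as "wav_path|attn_path". A minimal sketch of reading that output back (file paths are placeholders; downstream use, e.g. duration extraction, depends on the consuming model):

    import numpy as np

    with open("metadata_attn_mask.txt", encoding="utf-8") as f:
        pairs = [line.strip().split("|") for line in f if line.strip()]

    wav_path, attn_path = pairs[0]
    attn = np.load(attn_path)             # (mel_frames, text_length) after padding removal
    frame_to_token = attn.argmax(axis=1)  # most-attended input token per mel frame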
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,84 @@
1
+ import argparse
2
+ import os
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.managers import save_file
11
+ from TTS.tts.utils.speakers import SpeakerManager
12
+
13
+ parser = argparse.ArgumentParser(
14
+ description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
15
+ """
16
+ Example runs:
17
+ python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
18
+ """,
19
+ formatter_class=RawTextHelpFormatter,
20
+ )
21
+ parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
22
+ parser.add_argument("config_path", type=str, help="Path to model config file.")
23
+ parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
24
+ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
25
+ parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
26
+ parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
27
+ parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
28
+
29
+ args = parser.parse_args()
30
+
31
+ use_cuda = torch.cuda.is_available() and not args.disable_cuda
32
+
33
+ c_dataset = load_config(args.config_dataset_path)
34
+
35
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
36
+
37
+ if meta_data_eval is None:
38
+ wav_files = meta_data_train
39
+ else:
40
+ wav_files = meta_data_train + meta_data_eval
41
+
42
+ encoder_manager = SpeakerManager(
43
+ encoder_model_path=args.model_path,
44
+ encoder_config_path=args.config_path,
45
+ d_vectors_file_path=args.old_file,
46
+ use_cuda=use_cuda,
47
+ )
48
+
49
+ class_name_key = encoder_manager.encoder_config.class_name_key
50
+
51
+ # compute speaker embeddings
52
+ speaker_mapping = {}
53
+ for idx, wav_file in enumerate(tqdm(wav_files)):
54
+ if isinstance(wav_file, dict):
55
+ class_name = wav_file[class_name_key]
56
+ wav_file = wav_file["audio_file"]
57
+ else:
58
+ class_name = None
59
+
60
+ wav_file_name = os.path.basename(wav_file)
61
+ if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
62
+ # get the embedding from the old file
63
+ embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
64
+ else:
65
+ # extract the embedding
66
+ embedd = encoder_manager.compute_embedding_from_clip(wav_file)
67
+
68
+ # create speaker_mapping if target dataset is defined
69
+ speaker_mapping[wav_file_name] = {}
70
+ speaker_mapping[wav_file_name]["name"] = class_name
71
+ speaker_mapping[wav_file_name]["embedding"] = embedd
72
+
73
+ if speaker_mapping:
74
+ # save speaker_mapping if target dataset is defined
75
+ if os.path.isdir(args.output_path):
76
+ mapping_file_path = os.path.join(args.output_path, "speakers.pth")
77
+ else:
78
+ mapping_file_path = args.output_path
79
+
80
+ if os.path.dirname(mapping_file_path) != "":
81
+ os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
82
+
83
+ save_file(speaker_mapping, mapping_file_path)
84
+ print("Speaker embeddings saved at:", mapping_file_path)
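Note: the saved mapping has the shape {wav_file_name: {"name": class_name, "embedding": vector}}. Assuming save_file() serializes .pth outputs with torch.save (an assumption about the TTS helper, not stated here), the result can be inspected like this:

    import torch

    speakers = torch.load("speakers.pth")  # assumption: a torch-serialized dict written by save_file()
    clip = next(iter(speakers))
    print(clip, speakers[clip]["name"], len(speakers[clip]["embedding"]))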
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ # from TTS.utils.io import load_config
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.utils.audio import AudioProcessor
15
+
16
+
17
+ def main():
18
+ """Run preprocessing process."""
19
+ parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
20
+ parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
21
+ parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22
+ parser.add_argument(
23
+ "--data_path",
24
+ type=str,
25
+ required=False,
26
+ help="folder including the target set of wavs overriding dataset config.",
27
+ )
28
+ args, overrides = parser.parse_known_args()
29
+
30
+ CONFIG = load_config(args.config_path)
31
+ CONFIG.parse_known_args(overrides, relaxed_parser=True)
32
+
33
+ # load config
34
+ CONFIG.audio.signal_norm = False # do not apply earlier normalization
35
+ CONFIG.audio.stats_path = None # discard pre-defined stats
36
+
37
+ # load audio processor
38
+ ap = AudioProcessor(**CONFIG.audio.to_dict())
39
+
40
+ # load the meta data of target dataset
41
+ if args.data_path:
42
+ dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43
+ else:
44
+ dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45
+ print(f" > There are {len(dataset_items)} files.")
46
+
47
+ mel_sum = 0
48
+ mel_square_sum = 0
49
+ linear_sum = 0
50
+ linear_square_sum = 0
51
+ N = 0
52
+ for item in tqdm(dataset_items):
53
+ # compute features
54
+ wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55
+ linear = ap.spectrogram(wav)
56
+ mel = ap.melspectrogram(wav)
57
+
58
+ # compute stats
59
+ N += mel.shape[1]
60
+ mel_sum += mel.sum(1)
61
+ linear_sum += linear.sum(1)
62
+ mel_square_sum += (mel**2).sum(axis=1)
63
+ linear_square_sum += (linear**2).sum(axis=1)
64
+
65
+ mel_mean = mel_sum / N
66
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67
+ linear_mean = linear_sum / N
68
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69
+
70
+ output_file_path = args.out_path
71
+ stats = {}
72
+ stats["mel_mean"] = mel_mean
73
+ stats["mel_std"] = mel_scale
74
+ stats["linear_mean"] = linear_mean
75
+ stats["linear_std"] = linear_scale
76
+
77
+ print(f" > Avg mel spec mean: {mel_mean.mean()}")
78
+ print(f" > Avg mel spec scale: {mel_scale.mean()}")
79
+ print(f" > Avg linear spec mean: {linear_mean.mean()}")
80
+ print(f" > Avg linear spec scale: {linear_scale.mean()}")
81
+
82
+ # set default config values for mean-var scaling
83
+ CONFIG.audio.stats_path = output_file_path
84
+ CONFIG.audio.signal_norm = True
85
+ # remove redundant values
86
+ del CONFIG.audio.max_norm
87
+ del CONFIG.audio.min_level_db
88
+ del CONFIG.audio.symmetric_norm
89
+ del CONFIG.audio.clip_norm
90
+ stats["audio_config"] = CONFIG.audio.to_dict()
91
+ np.save(output_file_path, stats, allow_pickle=True)
92
+ print(f" > stats saved to {output_file_path}")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
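Note: the output is a pickled dict written with np.save, holding per-bin mel/linear mean and std vectors plus the audio config. A minimal sketch of applying the mel statistics afterwards (the stats filename is a placeholder; use whatever was passed as out_path):

    import numpy as np

    stats = np.load("scale_stats.npy", allow_pickle=True).item()  # placeholder name for the saved stats

    def normalize_mel(mel):
        # per-bin mean-variance scaling; mel is (n_mels, frames) as returned by ap.melspectrogram()
        return (mel - stats["mel_mean"][:, None]) / stats["mel_std"][:, None]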
TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ from argparse import RawTextHelpFormatter
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from TTS.config import load_config
8
+ from TTS.tts.datasets import load_tts_samples
9
+ from TTS.tts.utils.speakers import SpeakerManager
10
+
11
+
12
+ def compute_encoder_accuracy(dataset_items, encoder_manager):
13
+
14
+ class_name_key = encoder_manager.encoder_config.class_name_key
15
+ map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
16
+
17
+ class_acc_dict = {}
18
+
19
+ # compute embeddings for all wav_files
20
+ for item in tqdm(dataset_items):
21
+ class_name = item[class_name_key]
22
+ wav_file = item["audio_file"]
23
+
24
+ # extract the embedding
25
+ embedd = encoder_manager.compute_embedding_from_clip(wav_file)
26
+ if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
27
+ embedding = torch.FloatTensor(embedd).unsqueeze(0)
28
+ if encoder_manager.use_cuda:
29
+ embedding = embedding.cuda()
30
+
31
+ class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
32
+ predicted_label = map_classid_to_classname[str(class_id)]
33
+ else:
34
+ predicted_label = None
35
+
36
+ if class_name is not None and predicted_label is not None:
37
+ is_equal = int(class_name == predicted_label)
38
+ if class_name not in class_acc_dict:
39
+ class_acc_dict[class_name] = [is_equal]
40
+ else:
41
+ class_acc_dict[class_name].append(is_equal)
42
+ else:
43
+ raise RuntimeError("Error: class_name or/and predicted_label are None")
44
+
45
+ acc_avg = 0
46
+ for key, values in class_acc_dict.items():
47
+ acc = sum(values) / len(values)
48
+ print("Class", key, "Accuracy:", acc)
49
+ acc_avg += acc
50
+
51
+ print("Average Accuracy:", acc_avg / len(class_acc_dict))
52
+
53
+
54
+ if __name__ == "__main__":
55
+ parser = argparse.ArgumentParser(
56
+ description="""Compute the accuracy of the encoder.\n\n"""
57
+ """
58
+ Example runs:
59
+ python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
60
+ """,
61
+ formatter_class=RawTextHelpFormatter,
62
+ )
63
+ parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
64
+ parser.add_argument(
65
+ "config_path",
66
+ type=str,
67
+ help="Path to model config file.",
68
+ )
69
+
70
+ parser.add_argument(
71
+ "config_dataset_path",
72
+ type=str,
73
+ help="Path to dataset config file.",
74
+ )
75
+ parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
76
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
77
+
78
+ args = parser.parse_args()
79
+
80
+ c_dataset = load_config(args.config_dataset_path)
81
+
82
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
83
+ items = meta_data_train + meta_data_eval
84
+
85
+ enc_manager = SpeakerManager(
86
+ encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
87
+ )
88
+
89
+ compute_encoder_accuracy(items, enc_manager)
TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,287 @@
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.generic_utils import count_parameters
19
+
20
+ use_cuda = torch.cuda.is_available()
21
+
22
+
23
+ def setup_loader(ap, r, verbose=False):
24
+ tokenizer, _ = TTSTokenizer.init_from_config(c)
25
+ dataset = TTSDataset(
26
+ outputs_per_step=r,
27
+ compute_linear_spec=False,
28
+ samples=meta_data,
29
+ tokenizer=tokenizer,
30
+ ap=ap,
31
+ batch_group_size=0,
32
+ min_text_len=c.min_text_len,
33
+ max_text_len=c.max_text_len,
34
+ min_audio_len=c.min_audio_len,
35
+ max_audio_len=c.max_audio_len,
36
+ phoneme_cache_path=c.phoneme_cache_path,
37
+ precompute_num_workers=0,
38
+ use_noise_augment=False,
39
+ verbose=verbose,
40
+ speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
41
+ d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
42
+ )
43
+
44
+ if c.use_phonemes and c.compute_input_seq_cache:
45
+ # precompute phonemes to have a better estimate of sequence lengths.
46
+ dataset.compute_input_seq(c.num_loader_workers)
47
+ dataset.preprocess_samples()
48
+
49
+ loader = DataLoader(
50
+ dataset,
51
+ batch_size=c.batch_size,
52
+ shuffle=False,
53
+ collate_fn=dataset.collate_fn,
54
+ drop_last=False,
55
+ sampler=None,
56
+ num_workers=c.num_loader_workers,
57
+ pin_memory=False,
58
+ )
59
+ return loader
60
+
61
+
62
+ def set_filename(wav_path, out_path):
63
+ wav_file = os.path.basename(wav_path)
64
+ file_name = wav_file.split(".")[0]
65
+ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
66
+ os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
67
+ os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
68
+ os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
69
+ wavq_path = os.path.join(out_path, "quant", file_name)
70
+ mel_path = os.path.join(out_path, "mel", file_name)
71
+ wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
72
+ wav_path = os.path.join(out_path, "wav", file_name + ".wav")
73
+ return file_name, wavq_path, mel_path, wav_gl_path, wav_path
74
+
75
+
76
+ def format_data(data):
77
+ # setup input data
78
+ text_input = data["token_id"]
79
+ text_lengths = data["token_id_lengths"]
80
+ mel_input = data["mel"]
81
+ mel_lengths = data["mel_lengths"]
82
+ item_idx = data["item_idxs"]
83
+ d_vectors = data["d_vectors"]
84
+ speaker_ids = data["speaker_ids"]
85
+ attn_mask = data["attns"]
86
+ avg_text_length = torch.mean(text_lengths.float())
87
+ avg_spec_length = torch.mean(mel_lengths.float())
88
+
89
+ # dispatch data to GPU
90
+ if use_cuda:
91
+ text_input = text_input.cuda(non_blocking=True)
92
+ text_lengths = text_lengths.cuda(non_blocking=True)
93
+ mel_input = mel_input.cuda(non_blocking=True)
94
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
95
+ if speaker_ids is not None:
96
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
97
+ if d_vectors is not None:
98
+ d_vectors = d_vectors.cuda(non_blocking=True)
99
+ if attn_mask is not None:
100
+ attn_mask = attn_mask.cuda(non_blocking=True)
101
+ return (
102
+ text_input,
103
+ text_lengths,
104
+ mel_input,
105
+ mel_lengths,
106
+ speaker_ids,
107
+ d_vectors,
108
+ avg_text_length,
109
+ avg_spec_length,
110
+ attn_mask,
111
+ item_idx,
112
+ )
113
+
114
+
115
+ @torch.no_grad()
116
+ def inference(
117
+ model_name,
118
+ model,
119
+ ap,
120
+ text_input,
121
+ text_lengths,
122
+ mel_input,
123
+ mel_lengths,
124
+ speaker_ids=None,
125
+ d_vectors=None,
126
+ ):
127
+ if model_name == "glow_tts":
128
+ speaker_c = None
129
+ if speaker_ids is not None:
130
+ speaker_c = speaker_ids
131
+ elif d_vectors is not None:
132
+ speaker_c = d_vectors
133
+ outputs = model.inference_with_MAS(
134
+ text_input,
135
+ text_lengths,
136
+ mel_input,
137
+ mel_lengths,
138
+ aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
139
+ )
140
+ model_output = outputs["model_outputs"]
141
+ model_output = model_output.detach().cpu().numpy()
142
+
143
+ elif "tacotron" in model_name:
144
+ aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
145
+ outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
146
+ postnet_outputs = outputs["model_outputs"]
147
+ # normalize tacotron output
148
+ if model_name == "tacotron":
149
+ mel_specs = []
150
+ postnet_outputs = postnet_outputs.data.cpu().numpy()
151
+ for b in range(postnet_outputs.shape[0]):
152
+ postnet_output = postnet_outputs[b]
153
+ mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
154
+ model_output = torch.stack(mel_specs).cpu().numpy()
155
+
156
+ elif model_name == "tacotron2":
157
+ model_output = postnet_outputs.detach().cpu().numpy()
158
+ return model_output
159
+
160
+
161
+ def extract_spectrograms(
162
+ data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
163
+ ):
164
+ model.eval()
165
+ export_metadata = []
166
+ for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
167
+
168
+ # format data
169
+ (
170
+ text_input,
171
+ text_lengths,
172
+ mel_input,
173
+ mel_lengths,
174
+ speaker_ids,
175
+ d_vectors,
176
+ _,
177
+ _,
178
+ _,
179
+ item_idx,
180
+ ) = format_data(data)
181
+
182
+ model_output = inference(
183
+ c.model.lower(),
184
+ model,
185
+ ap,
186
+ text_input,
187
+ text_lengths,
188
+ mel_input,
189
+ mel_lengths,
190
+ speaker_ids,
191
+ d_vectors,
192
+ )
193
+
194
+ for idx in range(text_input.shape[0]):
195
+ wav_file_path = item_idx[idx]
196
+ wav = ap.load_wav(wav_file_path)
197
+ _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
198
+
199
+ # quantize and save wav
200
+ if quantized_wav:
201
+ wavq = ap.quantize(wav)
202
+ np.save(wavq_path, wavq)
203
+
204
+ # save TTS mel
205
+ mel = model_output[idx]
206
+ mel_length = mel_lengths[idx]
207
+ mel = mel[:mel_length, :].T
208
+ np.save(mel_path, mel)
209
+
210
+ export_metadata.append([wav_file_path, mel_path])
211
+ if save_audio:
212
+ ap.save_wav(wav, wav_path)
213
+
214
+ if debug:
215
+ print("Audio for debug saved at:", wav_gl_path)
216
+ wav = ap.inv_melspectrogram(mel)
217
+ ap.save_wav(wav, wav_gl_path)
218
+
219
+ with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
220
+ for data in export_metadata:
221
+ f.write(f"{data[0]}|{data[1]+'.npy'}\n")
222
+
223
+
224
+ def main(args): # pylint: disable=redefined-outer-name
225
+ # pylint: disable=global-variable-undefined
226
+ global meta_data, speaker_manager
227
+
228
+ # Audio processor
229
+ ap = AudioProcessor(**c.audio)
230
+
231
+ # load data instances
232
+ meta_data_train, meta_data_eval = load_tts_samples(
233
+ c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
234
+ )
235
+
236
+ # use eval and training partitions
237
+ meta_data = meta_data_train + meta_data_eval
238
+
239
+ # init speaker manager
240
+ if c.use_speaker_embedding:
241
+ speaker_manager = SpeakerManager(data_items=meta_data)
242
+ elif c.use_d_vector_file:
243
+ speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
244
+ else:
245
+ speaker_manager = None
246
+
247
+ # setup model
248
+ model = setup_model(c)
249
+
250
+ # restore model
251
+ model.load_checkpoint(c, args.checkpoint_path, eval=True)
252
+
253
+ if use_cuda:
254
+ model.cuda()
255
+
256
+ num_params = count_parameters(model)
257
+ print("\n > Model has {} parameters".format(num_params), flush=True)
258
+ # set r
259
+ r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
260
+ own_loader = setup_loader(ap, r, verbose=True)
261
+
262
+ extract_spectrograms(
263
+ own_loader,
264
+ model,
265
+ ap,
266
+ args.output_path,
267
+ quantized_wav=args.quantized,
268
+ save_audio=args.save_audio,
269
+ debug=args.debug,
270
+ metada_name="metada.txt",
271
+ )
272
+
273
+
274
+ if __name__ == "__main__":
275
+ parser = argparse.ArgumentParser()
276
+ parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
277
+ parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
278
+ parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
279
+ parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
280
+ parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
281
+ parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
282
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
283
+ args = parser.parse_args()
284
+
285
+ c = load_config(args.config_path)
286
+ c.audio.trim_silence = False
287
+ main(args)
TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,45 @@
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
+ def main():
10
+ # pylint: disable=bad-option-value
11
+ parser = argparse.ArgumentParser(
12
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13
+ """
14
+ Example runs:
15
+
16
+ python TTS/bin/find_unique_chars.py --config_path config.json
17
+ """,
18
+ formatter_class=RawTextHelpFormatter,
19
+ )
20
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21
+ args = parser.parse_args()
22
+
23
+ c = load_config(args.config_path)
24
+
25
+ # load all datasets
26
+ train_items, eval_items = load_tts_samples(
27
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28
+ )
29
+
30
+ items = train_items + eval_items
31
+
32
+ texts = "".join(item["text"] for item in items)
33
+ chars = set(texts)
34
+ lower_chars = filter(lambda c: c.islower(), chars)
35
+ chars_force_lower = [c.lower() for c in chars]
36
+ chars_force_lower = set(chars_force_lower)
37
+
38
+ print(f" > Number of unique characters: {len(chars)}")
39
+ print(f" > Unique characters: {''.join(sorted(chars))}")
40
+ print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41
+ print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,70 @@
1
+ """Find all the unique phonemes in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
11
+
12
+ phonemizer = Gruut(language="en-us")
13
+
14
+
15
+ def compute_phonemes(item):
16
+ try:
17
+ text = item[0]
18
+ ph = phonemizer.phonemize(text).split("|")
19
+ except:
20
+ return []
21
+ return list(set(ph))
22
+
23
+
24
+ def main():
25
+ # pylint: disable=W0601
26
+ global c
27
+ # pylint: disable=bad-option-value
28
+ parser = argparse.ArgumentParser(
29
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
30
+ """
31
+ Example runs:
32
+
33
+ python TTS/bin/find_unique_phonemes.py --config_path config.json
34
+ """,
35
+ formatter_class=RawTextHelpFormatter,
36
+ )
37
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
38
+ args = parser.parse_args()
39
+
40
+ c = load_config(args.config_path)
41
+
42
+ # load all datasets
43
+ train_items, eval_items = load_tts_samples(
44
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
45
+ )
46
+ items = train_items + eval_items
47
+ print("Num items:", len(items))
48
+
49
+ is_lang_def = all(item["language"] for item in items)
50
+
51
+ if not c.phoneme_language or not is_lang_def:
52
+ raise ValueError("Phoneme language must be defined in config.")
53
+
54
+ phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
55
+ phones = []
56
+ for ph in phonemes:
57
+ phones.extend(ph)
58
+ phones = set(phones)
59
+ lower_phones = filter(lambda c: c.islower(), phones)
60
+ phones_force_lower = [c.lower() for c in phones]
61
+ phones_force_lower = set(phones_force_lower)
62
+
63
+ print(f" > Number of unique phonemes: {len(phones)}")
64
+ print(f" > Unique phonemes: {''.join(sorted(phones))}")
65
+ print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
66
+ print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,85 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import pathlib
5
+
6
+ from tqdm import tqdm
7
+
8
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
9
+
10
+
11
+ def adjust_path_and_remove_silence(audio_path):
12
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
13
+ # ignore if the file exists
14
+ if os.path.exists(output_path) and not args.force:
15
+ return output_path
16
+
17
+ # create all directory structure
18
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
19
+ # remove the silence and save the audio
20
+ output_path = remove_silence(
21
+ model_and_utils,
22
+ audio_path,
23
+ output_path,
24
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
25
+ use_cuda=args.use_cuda,
26
+ )
27
+
28
+ return output_path
29
+
30
+
31
+ def preprocess_audios():
32
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
33
+ print("> Number of files: ", len(files))
34
+ if not args.force:
35
+ print("> Ignoring files that already exist in the output directory.")
36
+
37
+ if args.trim_just_beginning_and_end:
38
+ print("> Trimming nonspeech parts only at the beginning and the end.")
39
+ else:
40
+ print("> Trimming all nonspeech parts.")
41
+
42
+ if files:
43
+ # create threads
44
+ # num_threads = multiprocessing.cpu_count()
45
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
46
+ for f in tqdm(files):
47
+ adjust_path_and_remove_silence(f)
48
+ else:
49
+ print("> No files Found !")
50
+
51
+
52
+ if __name__ == "__main__":
53
+ parser = argparse.ArgumentParser(
54
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
55
+ )
56
+ parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
57
+ parser.add_argument(
58
+ "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
59
+ )
60
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting existing files")
61
+ parser.add_argument(
62
+ "-g",
63
+ "--glob",
64
+ type=str,
65
+ default="**/*.wav",
66
+ help="path in glob format to access wavs from input_dir. ex: wav48/*/*.wav",
67
+ )
68
+ parser.add_argument(
69
+ "-t",
70
+ "--trim_just_beginning_and_end",
71
+ type=bool,
72
+ default=True,
73
+ help="If True, this script will trim just the beginning and end nonspeech parts. If False, all nonspeech parts will be trimmed. Default True",
74
+ )
75
+ parser.add_argument(
76
+ "-c",
77
+ "--use_cuda",
78
+ type=bool,
79
+ default=False,
80
+ help="If True use cuda",
81
+ )
82
+ args = parser.parse_args()
83
+ # load the model and utils
84
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
85
+ preprocess_audios()
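For a single file the same helpers can be called directly; a minimal sketch, assuming the `TTS.utils.vad` API imported above and placeholder input/output paths:

```python
import pathlib

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

in_wav = "VCTK-Corpus/wav48/p225/p225_001.wav"                   # placeholder path
out_wav = "VCTK-Corpus-removed-silence/wav48/p225/p225_001.wav"  # placeholder path

# make sure the output directory exists, as the batch script does
pathlib.Path(out_wav).parent.mkdir(parents=True, exist_ok=True)

model_and_utils = get_vad_model_and_utils(use_cuda=False)
remove_silence(
    model_and_utils,
    in_wav,
    out_wav,
    trim_just_beginning_and_end=True,  # keep pauses inside the utterance
    use_cuda=False,
)
```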
TTS/bin/resample.py ADDED
@@ -0,0 +1,87 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from distutils.dir_util import copy_tree
6
+ from multiprocessing import Pool
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
+ def resample_file(func_args):
14
+ filename, output_sr = func_args
15
+ y, sr = librosa.load(filename, sr=output_sr)
16
+ sf.write(filename, y, sr)
17
+
18
+
19
+ if __name__ == "__main__":
20
+
21
+ parser = argparse.ArgumentParser(
22
+ description="""Resample a folder recursively with librosa
23
+ Can be used in place or create a copy of the folder as an output.\n\n
24
+ Example run:
25
+ python TTS/bin/resample.py
26
+ --input_dir /root/LJSpeech-1.1/
27
+ --output_sr 22050
28
+ --output_dir /root/resampled_LJSpeech-1.1/
29
+ --file_ext wav
30
+ --n_jobs 24
31
+ """,
32
+ formatter_class=RawTextHelpFormatter,
33
+ )
34
+
35
+ parser.add_argument(
36
+ "--input_dir",
37
+ type=str,
38
+ default=None,
39
+ required=True,
40
+ help="Path of the folder containing the audio files to resample",
41
+ )
42
+
43
+ parser.add_argument(
44
+ "--output_sr",
45
+ type=int,
46
+ default=22050,
47
+ required=False,
48
+ help="Sample rate to which the audio files should be resampled",
49
+ )
50
+
51
+ parser.add_argument(
52
+ "--output_dir",
53
+ type=str,
54
+ default=None,
55
+ required=False,
56
+ help="Path of the destination folder. If not defined, the operation is done in place",
57
+ )
58
+
59
+ parser.add_argument(
60
+ "--file_ext",
61
+ type=str,
62
+ default="wav",
63
+ required=False,
64
+ help="Extension of the audio files to resample",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
69
+ )
70
+
71
+ args = parser.parse_args()
72
+
73
+ if args.output_dir:
74
+ print("Recursively copying the input folder...")
75
+ copy_tree(args.input_dir, args.output_dir)
76
+ args.input_dir = args.output_dir
77
+
78
+ print("Resampling the audio files...")
79
+ audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
80
+ print(f"Found {len(audio_files)} files...")
81
+ audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
82
+ with Pool(processes=args.n_jobs) as p:
83
+ with tqdm(total=len(audio_files)) as pbar:
84
+ for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
85
+ pbar.update()
86
+
87
+ print("Done !")
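The worker above overwrites each file in place; a minimal sketch of the same librosa/soundfile calls writing to a separate output file instead (paths are placeholders):

```python
import librosa
import soundfile as sf

src = "LJSpeech-1.1/wavs/LJ001-0001.wav"   # placeholder path
dst = "resampled/LJ001-0001.wav"           # placeholder path
target_sr = 22050

# librosa resamples at load time when an explicit sr is given
y, sr = librosa.load(src, sr=target_sr)
sf.write(dst, y, sr)
```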
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,425 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ import pandas as pd
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ from TTS.utils.manage import ModelManager
13
+ from TTS.utils.synthesizer import Synthesizer
14
+ from tqdm.auto import tqdm
15
+
16
+
17
+ def str2bool(v):
18
+ if isinstance(v, bool):
19
+ return v
20
+ if v.lower() in ("yes", "true", "t", "y", "1"):
21
+ return True
22
+ if v.lower() in ("no", "false", "f", "n", "0"):
23
+ return False
24
+ raise argparse.ArgumentTypeError("Boolean value expected.")
25
+
26
+
27
+ def main():
28
+ description = """Synthesize speech on command line.
29
+
30
+ You can either use your trained model or choose a model from the provided list.
31
+
32
+ If you don't specify any models, then it uses LJSpeech based English model.
33
+
34
+ ## Example Runs
35
+
36
+ ### Single Speaker Models
37
+
38
+ - List provided models:
39
+
40
+ ```
41
+ $ tts --list_models
42
+ ```
43
+
44
+ - Query model info by idx:
45
+
46
+ ```
47
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
48
+ ```
49
+
50
+ - Query model info by full name:
51
+
52
+ ```
53
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
54
+ ```
55
+
56
+ - Run TTS with default models:
57
+
58
+ ```
59
+ $ tts --text "Text for TTS"
60
+ ```
61
+
62
+ - Run a TTS model with its default vocoder model:
63
+
64
+ ```
65
+ $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
66
+ ```
67
+
68
+ - Run with specific TTS and vocoder models from the list:
69
+
70
+ ```
71
+ $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
72
+ ```
73
+
74
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
75
+
76
+ ```
77
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
78
+ ```
79
+
80
+ - Run your own TTS and Vocoder models:
81
+ ```
82
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
83
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
84
+ ```
85
+
86
+ ### Multi-speaker Models
87
+
88
+ - List the available speakers and choose as <speaker_id> among them:
89
+
90
+ ```
91
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
92
+ ```
93
+
94
+ - Run the multi-speaker TTS model with the target speaker ID:
95
+
96
+ ```
97
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
98
+ ```
99
+
100
+ - Run your own multi-speaker TTS model:
101
+
102
+ ```
103
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
104
+ ```
105
+ """
106
+ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
107
+ # documentation in sync more easily.
108
+ parser = argparse.ArgumentParser(
109
+ description=description.replace(" ```\n", ""),
110
+ formatter_class=RawTextHelpFormatter,
111
+ )
112
+
113
+ parser.add_argument(
114
+ "--list_models",
115
+ type=str2bool,
116
+ nargs="?",
117
+ const=True,
118
+ default=False,
119
+ help="list available pre-trained TTS and vocoder models.",
120
+ )
121
+
122
+ parser.add_argument(
123
+ "--model_info_by_idx",
124
+ type=str,
125
+ default=None,
126
+ help="model info using query format: <model_type>/<model_query_idx>",
127
+ )
128
+
129
+ parser.add_argument(
130
+ "--model_info_by_name",
131
+ type=str,
132
+ default=None,
133
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
134
+ )
135
+
136
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
137
+
138
+ #parser.add_argument("--text_file_path", type=str, default=None, help="A csv file in LJSpeech format ('|' seperated id, text and speaker) to generate speech.")
139
+ #parser.add_argument("--speaker_name_filter", type=str, default=None, help="Filter texts corresponding to a specific speaker in text_file_path ")
140
+
141
+ # Args for running pre-trained TTS models.
142
+ parser.add_argument(
143
+ "--model_name",
144
+ type=str,
145
+ default="tts_models/en/ljspeech/tacotron2-DDC",
146
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
147
+ )
148
+ parser.add_argument(
149
+ "--vocoder_name",
150
+ type=str,
151
+ default=None,
152
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
153
+ )
154
+
155
+ # Args for running custom models
156
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
157
+ parser.add_argument(
158
+ "--model_path",
159
+ type=str,
160
+ default=None,
161
+ help="Path to model file.",
162
+ )
163
+ parser.add_argument(
164
+ "--out_path",
165
+ type=str,
166
+ default="tts_output.wav",
167
+ help="Output wav file path.",
168
+ )
169
+
170
+ # parser.add_argument(
171
+ # "--out_folder",
172
+ # type=str,
173
+ # default="tts_output",
174
+ # help="Output wav files folder.",
175
+ # )
176
+
177
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
178
+ parser.add_argument(
179
+ "--vocoder_path",
180
+ type=str,
181
+ help="Path to vocoder model file. If it is not defined, the model uses GL (Griffin-Lim) as the vocoder. Please make sure that you have installed the vocoder library (e.g. WaveRNN) beforehand.",
182
+ default=None,
183
+ )
184
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
185
+ parser.add_argument(
186
+ "--encoder_path",
187
+ type=str,
188
+ help="Path to speaker encoder model file.",
189
+ default=None,
190
+ )
191
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
192
+
193
+ # args for multi-speaker synthesis
194
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
195
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
196
+ parser.add_argument(
197
+ "--speaker_idx",
198
+ type=str,
199
+ help="Target speaker ID for a multi-speaker TTS model.",
200
+ default=None,
201
+ )
202
+ parser.add_argument(
203
+ "--language_idx",
204
+ type=str,
205
+ help="Target language ID for a multi-lingual TTS model.",
206
+ default=None,
207
+ )
208
+ parser.add_argument(
209
+ "--speaker_wav",
210
+ nargs="+",
211
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d-vector is computed as their average.",
212
+ default=None,
213
+ )
214
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
215
+ parser.add_argument(
216
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
217
+ )
218
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
219
+ parser.add_argument(
220
+ "--list_speaker_idxs",
221
+ help="List available speaker ids for the defined multi-speaker model.",
222
+ type=str2bool,
223
+ nargs="?",
224
+ const=True,
225
+ default=False,
226
+ )
227
+ parser.add_argument(
228
+ "--list_language_idxs",
229
+ help="List available language ids for the defined multi-lingual model.",
230
+ type=str2bool,
231
+ nargs="?",
232
+ const=True,
233
+ default=False,
234
+ )
235
+ # aux args
236
+ parser.add_argument(
237
+ "--save_spectogram",
238
+ type=bool,
239
+ help="If true, save the raw spectrogram for further (vocoder) processing in out_path.",
240
+ default=False,
241
+ )
242
+ parser.add_argument(
243
+ "--reference_wav",
244
+ type=str,
245
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
246
+ default=None,
247
+ )
248
+ parser.add_argument(
249
+ "--reference_speaker_idx",
250
+ type=str,
251
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
252
+ default=None,
253
+ )
254
+ args = parser.parse_args()
255
+
256
+ # print the help message if none of the text/listing/query args is set
257
+ check_args = [
258
+ args.text,
259
+ args.list_models,
260
+ args.list_speaker_idxs,
261
+ args.list_language_idxs,
262
+ args.reference_wav,
263
+ args.model_info_by_idx,
264
+ args.model_info_by_name,
265
+ ]
266
+ if not any(check_args):
267
+ parser.parse_args(["-h"])
268
+
269
+ # load model manager
270
+ path = Path(__file__).parent / "../.models.json"
271
+ manager = ModelManager(path)
272
+
273
+ model_path = None
274
+ config_path = None
275
+ speakers_file_path = None
276
+ language_ids_file_path = None
277
+ vocoder_path = None
278
+ vocoder_config_path = None
279
+ encoder_path = None
280
+ encoder_config_path = None
281
+
282
+ # CASE1 #list : list pre-trained TTS models
283
+ if args.list_models:
284
+ manager.list_models()
285
+ sys.exit()
286
+
287
+ # CASE2 #info : model info of pre-trained TTS models
288
+ if args.model_info_by_idx:
289
+ model_query = args.model_info_by_idx
290
+ manager.model_info_by_idx(model_query)
291
+ sys.exit()
292
+
293
+ if args.model_info_by_name:
294
+ model_query_full_name = args.model_info_by_name
295
+ manager.model_info_by_full_name(model_query_full_name)
296
+ sys.exit()
297
+
298
+ # CASE3: load pre-trained model paths
299
+ if args.model_name is not None and not args.model_path:
300
+ model_path, config_path, model_item = manager.download_model(args.model_name)
301
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
302
+
303
+ if args.vocoder_name is not None and not args.vocoder_path:
304
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
305
+
306
+ # CASE4: set custom model paths
307
+ if args.model_path is not None:
308
+ model_path = args.model_path
309
+ config_path = args.config_path
310
+ speakers_file_path = args.speakers_file_path
311
+ language_ids_file_path = args.language_ids_file_path
312
+
313
+ if args.vocoder_path is not None:
314
+ vocoder_path = args.vocoder_path
315
+ vocoder_config_path = args.vocoder_config_path
316
+
317
+ if args.encoder_path is not None:
318
+ encoder_path = args.encoder_path
319
+ encoder_config_path = args.encoder_config_path
320
+
321
+ # load models
322
+ synthesizer = Synthesizer(
323
+ model_path,
324
+ config_path,
325
+ speakers_file_path,
326
+ language_ids_file_path,
327
+ vocoder_path,
328
+ vocoder_config_path,
329
+ encoder_path,
330
+ encoder_config_path,
331
+ args.use_cuda,
332
+ )
333
+
334
+ # query speaker ids of a multi-speaker model.
335
+ if args.list_speaker_idxs:
336
+ print(
337
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model.)"
338
+ )
339
+ print(synthesizer.tts_model.speaker_manager.ids)
340
+ return
341
+
342
+ # query language ids of a multi-lingual model.
343
+ if args.list_language_idxs:
344
+ print(
345
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model.)"
346
+ )
347
+ print(synthesizer.tts_model.language_manager.ids)
348
+ return
349
+
350
+ # check the arguments against a multi-speaker model.
351
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
352
+ print(
353
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
354
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
355
+ )
356
+ return
357
+
358
+ # RUN THE SYNTHESIS
359
+ if args.text.endswith('.csv'):
360
+ df = pd.read_csv(args.text, sep='|')
361
+ num_cols = df.shape[1]
362
+ columns = ['id', 'text', 'speaker_name', 'gender', 'text_len', 'audio_len', 'speaker_wav'][:num_cols]
363
+ df = pd.read_csv(args.text, sep='|', names=columns)
364
+ df = df.head(10)
365
+
366
+ # print(f'Number of examples before speaker filter: {len(df)}')
367
+ # if args.speaker_name_filter:
368
+ # df = df[df['speaker_name']==args.speaker_name_filter]
369
+ # print(f'Number of examples after speaker filter: {len(df)}')
370
+
371
+ if len(df) == 0:
372
+ raise ValueError("No records found.")
373
+
374
+ if 'speaker_wav' in df.columns:
375
+ for idx, row in tqdm(df.iterrows(), total=len(df), desc="Synthesizing"):
376
+ wav = synthesizer.tts(
377
+ text=row['text'],
378
+ speaker_name=None,
379
+ language_name=args.language_idx,
380
+ speaker_wav=row['speaker_wav'],
381
+ reference_wav=args.reference_wav,
382
+ style_wav=args.capacitron_style_wav,
383
+ style_text=args.capacitron_style_text,
384
+ reference_speaker_name=args.reference_speaker_idx,
385
+ )
386
+ synthesizer.save_wav(wav, f'{args.out_path}/{row["id"]}.wav')
387
+ else:
388
+ for idx, row in tqdm(df.iterrows(), total=len(df), desc="Synthesizing"):
389
+ wav = synthesizer.tts(
390
+ row['text'],
391
+ row['speaker_name'] if 'speaker_name' in df.columns else args.speaker_idx,
392
+ args.language_idx,
393
+ args.speaker_wav,
394
+ reference_wav=args.reference_wav,
395
+ style_wav=args.capacitron_style_wav,
396
+ style_text=args.capacitron_style_text,
397
+ reference_speaker_name=args.reference_speaker_idx,
398
+ )
399
+ synthesizer.save_wav(wav, f'{args.out_path}/{row["id"]}.wav')
400
+ print(" > Saved output wav files in {}".format(args.out_path))
401
+ return True
402
+
403
+ if args.text:
404
+ print(" > Text: {}".format(args.text))
405
+
406
+
407
+ # kick it
408
+ wav = synthesizer.tts(
409
+ args.text,
410
+ args.speaker_idx,
411
+ args.language_idx,
412
+ args.speaker_wav,
413
+ reference_wav=args.reference_wav,
414
+ style_wav=args.capacitron_style_wav,
415
+ style_text=args.capacitron_style_text,
416
+ reference_speaker_name=args.reference_speaker_idx,
417
+ )
418
+
419
+ # save the results
420
+ print(" > Saving output to {}".format(args.out_path))
421
+ synthesizer.save_wav(wav, args.out_path)
422
+
423
+
424
+ if __name__ == "__main__":
425
+ main()
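The CSV branch above reads a '|'-separated, headerless file and names its columns id, text, speaker_name, ... based on how many columns it finds; when a speaker_wav column is present it is used for voice conditioning. A minimal sketch of producing such a file (the rows are invented examples):

```python
import pandas as pd

rows = [
    {"id": "utt_0001", "text": "Hello there.", "speaker_name": "p225"},
    {"id": "utt_0002", "text": "How are you today?", "speaker_name": "p226"},
]
df = pd.DataFrame(rows, columns=["id", "text", "speaker_name"])
# headerless, '|'-separated, as expected by the CSV branch of synthesize.py
df.to_csv("batch.csv", sep="|", index=False, header=False)
```

In this mode `--out_path` is used as a directory and each row is written to `<out_path>/<id>.wav`; note that the code above also limits the batch to the first 10 rows via `df.head(10)`.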
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.samplers import PerfectBatchSampler
17
+ from TTS.encoder.utils.training import init_training
18
+ from TTS.encoder.utils.visual import plot_embeddings
19
+ from TTS.tts.datasets import load_tts_samples
20
+ from TTS.utils.audio import AudioProcessor
21
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
22
+ from TTS.utils.io import copy_model_files
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
+ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
35
+ num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
36
+ num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
37
+
38
+ dataset = EncoderDataset(
39
+ c,
40
+ ap,
41
+ meta_data_eval if is_val else meta_data_train,
42
+ voice_len=c.voice_len,
43
+ num_utter_per_class=num_utter_per_class,
44
+ num_classes_in_batch=num_classes_in_batch,
45
+ verbose=verbose,
46
+ augmentation_config=c.audio_augmentation if not is_val else None,
47
+ use_torch_spec=c.model_params.get("use_torch_spec", False),
48
+ )
49
+ # get classes list
50
+ classes = dataset.get_class_list()
51
+
52
+ sampler = PerfectBatchSampler(
53
+ dataset.items,
54
+ classes,
55
+ batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
56
+ num_classes_in_batch=num_classes_in_batch,
57
+ num_gpus=1,
58
+ shuffle=not is_val,
59
+ drop_last=True,
60
+ )
61
+
62
+ if len(classes) < num_classes_in_batch:
63
+ if is_val:
64
+ raise RuntimeError(
65
+ f"config.eval_num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (the total number of classes in the eval dataset)!"
66
+ )
67
+ raise RuntimeError(
68
+ f"config.num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (the total number of classes in the train dataset)!"
69
+ )
70
+
71
+ # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
72
+ if is_val:
73
+ dataset.set_classes(train_classes)
74
+
75
+ loader = DataLoader(
76
+ dataset,
77
+ num_workers=c.num_loader_workers,
78
+ batch_sampler=sampler,
79
+ collate_fn=dataset.collate_fn,
80
+ )
81
+
82
+ return loader, classes, dataset.get_map_classid_to_classname()
83
+
84
+
85
+ def evaluation(model, criterion, data_loader, global_step):
86
+ eval_loss = 0
87
+ for _, data in enumerate(data_loader):
88
+ with torch.no_grad():
89
+ # setup input data
90
+ inputs, labels = data
91
+
92
+ # group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
93
+ labels = torch.transpose(
94
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
95
+ ).reshape(labels.shape)
96
+ inputs = torch.transpose(
97
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
98
+ ).reshape(inputs.shape)
99
+
100
+ # dispatch data to GPU
101
+ if use_cuda:
102
+ inputs = inputs.cuda(non_blocking=True)
103
+ labels = labels.cuda(non_blocking=True)
104
+
105
+ # forward pass model
106
+ outputs = model(inputs)
107
+
108
+ # loss computation
109
+ loss = criterion(
110
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
111
+ )
112
+
113
+ eval_loss += loss.item()
114
+
115
+ eval_avg_loss = eval_loss / len(data_loader)
116
+ # save stats
117
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
118
+ # plot the last batch in the evaluation
119
+ figures = {
120
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
121
+ }
122
+ dashboard_logger.eval_figures(global_step, figures)
123
+ return eval_avg_loss
124
+
125
+
126
+ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
127
+ model.train()
128
+ best_loss = float("inf")
129
+ avg_loader_time = 0
130
+ end_time = time.time()
131
+ for epoch in range(c.epochs):
132
+ tot_loss = 0
133
+ epoch_time = 0
134
+ for _, data in enumerate(data_loader):
135
+ start_time = time.time()
136
+
137
+ # setup input data
138
+ inputs, labels = data
139
+ # group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
140
+ labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
141
+ labels.shape
142
+ )
143
+ inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
144
+ inputs.shape
145
+ )
146
+ # ToDo: move it to a unit test
147
+ # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
148
+ # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
149
+ # idx = 0
150
+ # for j in range(0, c.num_classes_in_batch, 1):
151
+ # for i in range(j, len(labels), c.num_classes_in_batch):
152
+ # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
153
+ # print("Invalid")
154
+ # print(labels)
155
+ # exit()
156
+ # idx += 1
157
+ # labels = labels_converted
158
+ # inputs = inputs_converted
159
+
160
+ loader_time = time.time() - end_time
161
+ global_step += 1
162
+
163
+ # setup lr
164
+ if c.lr_decay:
165
+ scheduler.step()
166
+ optimizer.zero_grad()
167
+
168
+ # dispatch data to GPU
169
+ if use_cuda:
170
+ inputs = inputs.cuda(non_blocking=True)
171
+ labels = labels.cuda(non_blocking=True)
172
+
173
+ # forward pass model
174
+ outputs = model(inputs)
175
+
176
+ # loss computation
177
+ loss = criterion(
178
+ outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
179
+ )
180
+ loss.backward()
181
+ grad_norm, _ = check_update(model, c.grad_clip)
182
+ optimizer.step()
183
+
184
+ step_time = time.time() - start_time
185
+ epoch_time += step_time
186
+
187
+ # accumulate the total epoch loss
188
+ tot_loss += loss.item()
189
+
190
+ # Averaged Loader Time
191
+ num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
192
+ avg_loader_time = (
193
+ 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
194
+ if avg_loader_time != 0
195
+ else loader_time
196
+ )
197
+ current_lr = optimizer.param_groups[0]["lr"]
198
+
199
+ if global_step % c.steps_plot_stats == 0:
200
+ # Plot Training Epoch Stats
201
+ train_stats = {
202
+ "loss": loss.item(),
203
+ "lr": current_lr,
204
+ "grad_norm": grad_norm,
205
+ "step_time": step_time,
206
+ "avg_loader_time": avg_loader_time,
207
+ }
208
+ dashboard_logger.train_epoch_stats(global_step, train_stats)
209
+ figures = {
210
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
211
+ }
212
+ dashboard_logger.train_figures(global_step, figures)
213
+
214
+ if global_step % c.print_step == 0:
215
+ print(
216
+ " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
217
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
218
+ global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
219
+ ),
220
+ flush=True,
221
+ )
222
+
223
+ if global_step % c.save_step == 0:
224
+ # save model
225
+ save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
226
+
227
+ end_time = time.time()
228
+
229
+ print("")
230
+ print(
231
+ ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
232
+ "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
233
+ epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
234
+ ),
235
+ flush=True,
236
+ )
237
+ # evaluation
238
+ if c.run_eval:
239
+ model.eval()
240
+ eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
241
+ print("\n\n")
242
+ print("--> EVAL PERFORMANCE")
243
+ print(
244
+ " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
245
+ flush=True,
246
+ )
247
+ # save the best checkpoint
248
+ best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
249
+ model.train()
250
+
251
+ return best_loss, global_step
252
+
253
+
254
+ def main(args): # pylint: disable=redefined-outer-name
255
+ # pylint: disable=global-variable-undefined
256
+ global meta_data_train
257
+ global meta_data_eval
258
+ global train_classes
259
+
260
+ ap = AudioProcessor(**c.audio)
261
+ model = setup_encoder_model(c)
262
+
263
+ optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
264
+
265
+ # pylint: disable=redefined-outer-name
266
+ meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
267
+
268
+ train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
269
+ if c.run_eval:
270
+ eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
271
+ else:
272
+ eval_data_loader = None
273
+
274
+ num_classes = len(train_classes)
275
+ criterion = model.get_criterion(c, num_classes)
276
+
277
+ if c.loss == "softmaxproto" and c.model != "speaker_encoder":
278
+ c.map_classid_to_classname = map_classid_to_classname
279
+ copy_model_files(c, OUT_PATH)
280
+
281
+ if args.restore_path:
282
+ criterion, args.restore_step = model.load_checkpoint(
283
+ c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
284
+ )
285
+ print(" > Model restored from step %d" % args.restore_step, flush=True)
286
+ else:
287
+ args.restore_step = 0
288
+
289
+ if c.lr_decay:
290
+ scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
291
+ else:
292
+ scheduler = None
293
+
294
+ num_params = count_parameters(model)
295
+ print("\n > Model has {} parameters".format(num_params), flush=True)
296
+
297
+ if use_cuda:
298
+ model = model.cuda()
299
+ criterion.cuda()
300
+
301
+ global_step = args.restore_step
302
+ _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
307
+
308
+ try:
309
+ main(args)
310
+ except KeyboardInterrupt:
311
+ remove_experiment_folder(OUT_PATH)
312
+ try:
313
+ sys.exit(0)
314
+ except SystemExit:
315
+ os._exit(0) # pylint: disable=protected-access
316
+ except Exception: # pylint: disable=broad-except
317
+ remove_experiment_folder(OUT_PATH)
318
+ traceback.print_exc()
319
+ sys.exit(1)
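A standalone sketch of the batch regrouping done in `train()` and `evaluation()` above: the perfect batch sampler yields utterances interleaved by class ([3,2,1,3,2,1]) and the transpose/reshape groups them per class ([3,3,2,2,1,1]). The labels below are toy values:

```python
import torch

num_utter_per_class = 2
num_classes_in_batch = 3

# class-interleaved labels, as the PerfectBatchSampler delivers them
labels = torch.tensor([3, 2, 1, 3, 2, 1])

# same transpose/reshape as in train()/evaluation()
regrouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)

print(regrouped)  # tensor([3, 3, 2, 2, 1, 1])
```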
TTS/bin/train_tts.py ADDED
@@ -0,0 +1,71 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
+ @dataclass
12
+ class TrainTTSArgs(TrainerArgs):
13
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
+
15
+
16
+ def main():
17
+ """Run `tts` model training directly from a `config.json` file."""
18
+ # init trainer args
19
+ train_args = TrainTTSArgs()
20
+ parser = train_args.init_argparse(arg_prefix="")
21
+
22
+ # override trainer args from command-line args
23
+ args, config_overrides = parser.parse_known_args()
24
+ train_args.parse_args(args)
25
+
26
+ # load config.json and register
27
+ if args.config_path or args.continue_path:
28
+ if args.config_path:
29
+ # init from a file
30
+ config = load_config(args.config_path)
31
+ if len(config_overrides) > 0:
32
+ config.parse_known_args(config_overrides, relaxed_parser=True)
33
+ elif args.continue_path:
34
+ # continue from a prev experiment
35
+ config = load_config(os.path.join(args.continue_path, "config.json"))
36
+ if len(config_overrides) > 0:
37
+ config.parse_known_args(config_overrides, relaxed_parser=True)
38
+ else:
39
+ # init from console args
40
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41
+
42
+ config_base = BaseTrainingConfig()
43
+ config_base.parse_known_args(config_overrides)
44
+ config = register_config(config_base.model)()
45
+
46
+ # load training samples
47
+ train_samples, eval_samples = load_tts_samples(
48
+ config.datasets,
49
+ eval_split=True,
50
+ eval_split_max_size=config.eval_split_max_size,
51
+ eval_split_size=config.eval_split_size,
52
+ )
53
+
54
+ # init the model from config
55
+ model = setup_model(config, train_samples + eval_samples)
56
+
57
+ # init the trainer and 🚀
58
+ trainer = Trainer(
59
+ train_args,
60
+ model.config,
61
+ config.output_path,
62
+ model=model,
63
+ train_samples=train_samples,
64
+ eval_samples=eval_samples,
65
+ parse_command_line_args=False,
66
+ )
67
+ trainer.fit()
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.utils.audio import AudioProcessor
8
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
+ from TTS.vocoder.models import setup_model
10
+
11
+
12
+ @dataclass
13
+ class TrainVocoderArgs(TrainerArgs):
14
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15
+
16
+
17
+ def main():
18
+ """Run `vocoder` model training directly from a `config.json` file."""
19
+ # init trainer args
20
+ train_args = TrainVocoderArgs()
21
+ parser = train_args.init_argparse(arg_prefix="")
22
+
23
+ # override trainer args from command-line args
24
+ args, config_overrides = parser.parse_known_args()
25
+ train_args.parse_args(args)
26
+
27
+ # load config.json and register
28
+ if args.config_path or args.continue_path:
29
+ if args.config_path:
30
+ # init from a file
31
+ config = load_config(args.config_path)
32
+ if len(config_overrides) > 0:
33
+ config.parse_known_args(config_overrides, relaxed_parser=True)
34
+ elif args.continue_path:
35
+ # continue from a prev experiment
36
+ config = load_config(os.path.join(args.continue_path, "config.json"))
37
+ if len(config_overrides) > 0:
38
+ config.parse_known_args(config_overrides, relaxed_parser=True)
39
+ else:
40
+ # init from console args
41
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42
+
43
+ config_base = BaseTrainingConfig()
44
+ config_base.parse_known_args(config_overrides)
45
+ config = register_config(config_base.model)()
46
+
47
+ # load training samples
48
+ if "feature_path" in config and config.feature_path:
49
+ # load pre-computed features
50
+ print(f" > Loading features from: {config.feature_path}")
51
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52
+ else:
53
+ # load data raw wav files
54
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55
+
56
+ # setup audio processor
57
+ ap = AudioProcessor(**config.audio)
58
+
59
+ # init the model from config
60
+ model = setup_model(config)
61
+
62
+ # init the trainer and 🚀
63
+ trainer = Trainer(
64
+ train_args,
65
+ config,
66
+ config.output_path,
67
+ model=model,
68
+ train_samples=train_samples,
69
+ eval_samples=eval_samples,
70
+ training_assets={"audio_processor": ap},
71
+ parse_command_line_args=False,
72
+ )
73
+ trainer.fit()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,100 @@
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations."""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.utils.audio import AudioProcessor
11
+ from TTS.utils.io import load_config
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.utils.generic_utils import setup_generator
15
+
16
+ parser = argparse.ArgumentParser()
17
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
18
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
19
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
20
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
21
+ parser.add_argument(
22
+ "--num_iter", type=int, help="Number of model inference iterations that you would like to optimize the noise schedule for."
23
+ )
24
+ parser.add_argument("--use_cuda", type=bool, help="enable/disable CUDA.")
25
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of data samples used for inference.")
26
+ parser.add_argument(
27
+ "--search_depth",
28
+ type=int,
29
+ default=3,
30
+ help="Search granularity. Increasing this increases the run-time exponentially.",
31
+ )
32
+
33
+ # load config
34
+ args = parser.parse_args()
35
+ config = load_config(args.config_path)
36
+
37
+ # setup audio processor
38
+ ap = AudioProcessor(**config.audio)
39
+
40
+ # load dataset
41
+ _, train_data = load_wav_data(args.data_path, 0)
42
+ train_data = train_data[: args.num_samples]
43
+ dataset = WaveGradDataset(
44
+ ap=ap,
45
+ items=train_data,
46
+ seq_len=-1,
47
+ hop_len=ap.hop_length,
48
+ pad_short=config.pad_short,
49
+ conv_pad=config.conv_pad,
50
+ is_training=True,
51
+ return_segments=False,
52
+ use_noise_augment=False,
53
+ use_cache=False,
54
+ verbose=True,
55
+ )
56
+ loader = DataLoader(
57
+ dataset,
58
+ batch_size=1,
59
+ shuffle=False,
60
+ collate_fn=dataset.collate_full_clips,
61
+ drop_last=False,
62
+ num_workers=config.num_loader_workers,
63
+ pin_memory=False,
64
+ )
65
+
66
+ # setup the model
67
+ model = setup_generator(config)
68
+ if args.use_cuda:
69
+ model.cuda()
70
+
71
+ # setup optimization parameters
72
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
73
+ print(base_values)
74
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
75
+ best_error = float("inf")
76
+ best_schedule = None
77
+ total_search_iter = len(base_values) ** args.num_iter
78
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
79
+ beta = exponents * base
80
+ model.compute_noise_level(beta)
81
+ for data in loader:
82
+ mel, audio = data
83
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
84
+
85
+ if args.use_cuda:
86
+ y_hat = y_hat.cpu()
87
+ y_hat = y_hat.numpy()
88
+
89
+ mel_hat = []
90
+ for i in range(y_hat.shape[0]):
91
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
92
+ mel_hat.append(torch.from_numpy(m))
93
+
94
+ mel_hat = torch.stack(mel_hat)
95
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
96
+ if mse.item() < best_error:
97
+ best_error = mse.item()
98
+ best_schedule = {"beta": beta}
99
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
100
+ np.save(args.output_path, best_schedule)
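A numpy-only sketch of how the candidate noise schedules above are built: each candidate `beta` is the fixed set of exponentially spaced values scaled element-wise by one combination of base values (toy sizes here to keep the grid small):

```python
from itertools import product as cartesian_product

import numpy as np

num_iter = 4      # toy number of inference steps
search_depth = 3  # toy search granularity

base_values = sorted(10 * np.random.uniform(size=search_depth))
exponents = 10 ** np.linspace(-6, -1, num=num_iter)

# one candidate schedule per combination of base values
schedules = [exponents * np.array(base) for base in cartesian_product(base_values, repeat=num_iter)]
print(len(schedules))  # search_depth ** num_iter = 81 candidate schedules
```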
TTS/config/__init__.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
+ def read_json_with_comments(json_path):
15
+ """for backward compat."""
16
+ # fallback to json
17
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
+ input_str = f.read()
19
+ # handle comments
20
+ input_str = re.sub(r"\\\n", "", input_str)
21
+ input_str = re.sub(r"//.*\n", "\n", input_str)
22
+ data = json.loads(input_str)
23
+ return data
24
+
25
+
26
+ def register_config(model_name: str) -> Coqpit:
27
+ """Find the right config for the given model name.
28
+
29
+ Args:
30
+ model_name (str): Model name.
31
+
32
+ Raises:
33
+ ModuleNotFoundError: No matching config for the model name.
34
+
35
+ Returns:
36
+ Coqpit: config class.
37
+ """
38
+ config_class = None
39
+ config_name = model_name + "_config"
40
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
41
+ for path in paths:
42
+ try:
43
+ config_class = find_module(path, config_name)
44
+ except ModuleNotFoundError:
45
+ pass
46
+ if config_class is None:
47
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
48
+ return config_class
49
+
50
+
51
+ def _process_model_name(config_dict: Dict) -> str:
52
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
53
+
54
+ Args:
55
+ config_dict (Dict): A dictionary including the config fields.
56
+
57
+ Returns:
58
+ str: Formatted modelname.
59
+ """
60
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
61
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
62
+ return model_name
63
+
64
+
65
+ def load_config(config_path: str) -> Coqpit:
66
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
67
+ to find the corresponding Config class. Then initialize the Config.
68
+
69
+ Args:
70
+ config_path (str): path to the config file.
71
+
72
+ Raises:
73
+ TypeError: given config file has an unknown type.
74
+
75
+ Returns:
76
+ Coqpit: TTS config object.
77
+ """
78
+ config_dict = {}
79
+ ext = os.path.splitext(config_path)[1]
80
+ if ext in (".yml", ".yaml"):
81
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
82
+ data = yaml.safe_load(f)
83
+ elif ext == ".json":
84
+ try:
85
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
86
+ data = json.load(f)
87
+ except json.decoder.JSONDecodeError:
88
+ # backwards compat.
89
+ data = read_json_with_comments(config_path)
90
+ else:
91
+ raise TypeError(f" [!] Unknown config file type {ext}")
92
+ config_dict.update(data)
93
+ model_name = _process_model_name(config_dict)
94
+ config_class = register_config(model_name.lower())
95
+ config = config_class()
96
+ config.from_dict(config_dict)
97
+ return config
98
+
99
+
100
+ def check_config_and_model_args(config, arg_name, value):
101
+ """Check the given argument in `config.model_args`, if it exists, or in `config` for
102
+ the given value.
103
+
104
+ Return False if the argument does not exist in `config.model_args` or `config`.
105
+ This is to patch up the compatibility between models with and without `model_args`.
106
+
107
+ TODO: Remove this in the future with a unified approach.
108
+ """
109
+ if hasattr(config, "model_args"):
110
+ if arg_name in config.model_args:
111
+ return config.model_args[arg_name] == value
112
+ if hasattr(config, arg_name):
113
+ return config[arg_name] == value
114
+ return False
115
+
116
+
117
+ def get_from_config_or_model_args(config, arg_name):
118
+ """Get the given argument from `config.model_args` if it exists, otherwise from `config`."""
119
+ if hasattr(config, "model_args"):
120
+ if arg_name in config.model_args:
121
+ return config.model_args[arg_name]
122
+ return config[arg_name]
123
+
124
+
125
+ def get_from_config_or_model_args_with_default(config, arg_name, def_val):
126
+ """Get the given argument from `config.model_args` if it exists, from `config` otherwise, or return `def_val` as a fallback."""
127
+ if hasattr(config, "model_args"):
128
+ if arg_name in config.model_args:
129
+ return config.model_args[arg_name]
130
+ if hasattr(config, arg_name):
131
+ return config[arg_name]
132
+ return def_val
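A minimal usage sketch of the helpers above; the config path and the argument names (`use_speaker_embedding`, `d_vector_dim`) are placeholders chosen for illustration:

```python
from TTS.config import (
    check_config_and_model_args,
    get_from_config_or_model_args_with_default,
    load_config,
)

config = load_config("path/to/config.json")  # placeholder path

# True only if the field exists (in config.model_args or config) and equals the given value
if check_config_and_model_args(config, "use_speaker_embedding", True):
    print("multi-speaker model")

# read a field with a fallback when it is defined in neither place
d_vector_dim = get_from_config_or_model_args_with_default(config, "d_vector_dim", 512)
```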
TTS/config/shared_configs.py ADDED
@@ -0,0 +1,260 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
+ @dataclass
9
+ class BaseAudioConfig(Coqpit):
10
+ """Base config to define audio processing parameters. It is used to initialize
11
+ ```TTS.utils.audio.AudioProcessor.```
12
+
13
+ Args:
14
+ fft_size (int):
15
+ Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024.
16
+
17
+ win_length (int):
18
+ Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
+ ```fft_size```. Defaults to 1024.
20
+
21
+ hop_length (int):
22
+ Number of audio samples between adjacent STFT columns. Defaults to 1024.
23
+
24
+ frame_shift_ms (int):
25
+ Set ```hop_length``` based on milliseconds and sampling rate.
26
+
27
+ frame_length_ms (int):
28
+ Set ```win_length``` based on milliseconds and sampling rate.
29
+
30
+ stft_pad_mode (str):
31
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
+
33
+ sample_rate (int):
34
+ Audio sampling rate. Defaults to 22050.
35
+
36
+ resample (bool):
37
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
+
39
+ preemphasis (float):
40
+ Preemphasis coefficient. Defaults to 0.0.
41
+
42
+ ref_level_db (int): 20
43
+ Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air.
44
+ Defaults to 20.
45
+
46
+ do_sound_norm (bool):
47
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
+
49
+ log_func (str):
50
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
+
52
+ do_trim_silence (bool):
53
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
+
55
+ do_amp_to_db_linear (bool, optional):
56
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
+
58
+ do_amp_to_db_mel (bool, optional):
59
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
+
61
+ pitch_fmax (float, optional):
62
+ Maximum frequency of the F0 frames. Defaults to ```640```.
63
+
64
+ pitch_fmin (float, optional):
65
+ Minimum frequency of the F0 frames. Defaults to ```0```.
66
+
67
+ trim_db (int):
68
+ Silence threshold used for silence trimming. Defaults to 45.
69
+
70
+ do_rms_norm (bool, optional):
71
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
+
73
+ db_level (int, optional):
74
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
+
76
+ power (float):
77
+ Exponent used for expanding spectrogra levels before running Griffin Lim. It helps to reduce the
78
+ artifacts in the synthesized voice. Defaults to 1.5.
79
+
80
+ griffin_lim_iters (int):
81
+ Number of Griffing Lim iterations. Defaults to 60.
82
+
83
+ num_mels (int):
84
+ Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
85
+
86
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
+ It needs to be adjusted for a dataset. Defaults to 0.
88
+
89
+ mel_fmax (float):
90
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
+
92
+ spec_gain (int):
93
+ Gain applied when converting amplitude to DB. Defaults to 20.
94
+
95
+ signal_norm (bool):
96
+ enable/disable signal normalization. Defaults to True.
97
+
98
+ min_level_db (int):
99
+ minimum db threshold for the computed melspectrograms. Defaults to -100.
100
+
101
+ symmetric_norm (bool):
102
+ enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
103
+ [0, k], Defaults to True.
104
+
105
+ max_norm (float):
106
+ ```k``` defining the normalization range. Defaults to 4.0.
107
+
108
+ clip_norm (bool):
109
+ enable/disable clipping the our of range values in the normalized audio signal. Defaults to True.
110
+
111
+ stats_path (str):
112
+ Path to the computed stats file. Defaults to None.
113
+ """
114
+
115
+ # stft parameters
116
+ fft_size: int = 1024
117
+ win_length: int = 1024
118
+ hop_length: int = 256
119
+ frame_shift_ms: int = None
120
+ frame_length_ms: int = None
121
+ stft_pad_mode: str = "reflect"
122
+ # audio processing parameters
123
+ sample_rate: int = 22050
124
+ resample: bool = False
125
+ preemphasis: float = 0.0
126
+ ref_level_db: int = 20
127
+ do_sound_norm: bool = False
128
+ log_func: str = "np.log10"
129
+ # silence trimming
130
+ do_trim_silence: bool = True
131
+ trim_db: int = 45
132
+ # rms volume normalization
133
+ do_rms_norm: bool = False
134
+ db_level: float = None
135
+ # griffin-lim params
136
+ power: float = 1.5
137
+ griffin_lim_iters: int = 60
138
+ # mel-spec params
139
+ num_mels: int = 80
140
+ mel_fmin: float = 0.0
141
+ mel_fmax: float = None
142
+ spec_gain: int = 20
143
+ do_amp_to_db_linear: bool = True
144
+ do_amp_to_db_mel: bool = True
145
+ # f0 params
146
+ pitch_fmax: float = 640.0
147
+ pitch_fmin: float = 0.0
148
+ # normalization params
149
+ signal_norm: bool = True
150
+ min_level_db: int = -100
151
+ symmetric_norm: bool = True
152
+ max_norm: float = 4.0
153
+ clip_norm: bool = True
154
+ stats_path: str = None
155
+
156
+ def check_values(
157
+ self,
158
+ ):
159
+ """Check config fields"""
160
+ c = asdict(self)
161
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
+ check_argument(
165
+ "frame_length_ms",
166
+ c,
167
+ restricted=True,
168
+ min_val=10,
169
+ max_val=1000,
170
+ alternative="win_length",
171
+ )
172
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
+
179
+ # normalization parameters
180
+ check_argument("signal_norm", c, restricted=True)
181
+ check_argument("symmetric_norm", c, restricted=True)
182
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
+ check_argument("clip_norm", c, restricted=True)
184
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
+ check_argument("do_trim_silence", c, restricted=True)
188
+ check_argument("trim_db", c, restricted=True)
189
+
190
+
191
+ @dataclass
192
+ class BaseDatasetConfig(Coqpit):
193
+ """Base config for TTS datasets.
194
+
195
+ Args:
196
+ name (str):
197
+ Dataset name that defines the preprocessor in use. Defaults to None.
198
+
199
+ path (str):
200
+ Root path to the dataset files. Defaults to None.
201
+
202
+ meta_file_train (str):
203
+ Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
204
+ Defaults to None.
205
+
206
+ ignored_speakers (List):
207
+ List of speakers IDs that are not used at the training. Default None.
208
+
209
+ language (str):
210
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
211
+
212
+ meta_file_val (str):
213
+ Name of the dataset meta file that defines the instances used at validation.
214
+
215
+ meta_file_attn_mask (str):
216
+ Path to the file that lists the attention mask files used with models that require attention masks to
217
+ train the duration predictor.
218
+ """
219
+
220
+ name: str = ""
221
+ path: str = ""
222
+ meta_file_train: str = ""
223
+ ignored_speakers: List[str] = None
224
+ language: str = ""
225
+ meta_file_val: str = ""
226
+ meta_file_attn_mask: str = ""
227
+
228
+ def check_values(
229
+ self,
230
+ ):
231
+ """Check config fields"""
232
+ c = asdict(self)
233
+ check_argument("name", c, restricted=True)
234
+ check_argument("path", c, restricted=True)
235
+ check_argument("meta_file_train", c, restricted=True)
236
+ check_argument("meta_file_val", c, restricted=False)
237
+ check_argument("meta_file_attn_mask", c, restricted=False)
238
+
239
+
240
+ @dataclass
241
+ class BaseTrainingConfig(TrainerConfig):
242
+ """Base config to define the basic 🐸TTS training parameters that are shared
243
+ among all the models. It is based on ```Trainer.TrainingConfig```.
244
+
245
+ Args:
246
+ model (str):
247
+ Name of the model that is used in the training.
248
+
249
+ num_loader_workers (int):
250
+ Number of workers for training time dataloader.
251
+
252
+ num_eval_loader_workers (int):
253
+ Number of workers for evaluation time dataloader.
254
+ """
255
+
256
+ model: str = None
257
+ # dataloading
258
+ num_loader_workers: int = 0
259
+ num_eval_loader_workers: int = 0
260
+ use_noise_augment: bool = False
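A minimal sketch instantiating the config dataclasses above and running their validators; the dataset values are placeholders:

```python
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

audio_config = BaseAudioConfig(
    sample_rate=22050,
    fft_size=1024,
    win_length=1024,
    hop_length=256,
    num_mels=80,
)
audio_config.check_values()  # raises if a field is outside the allowed range

dataset_config = BaseDatasetConfig(
    name="ljspeech",             # formatter name, example value
    path="/data/LJSpeech-1.1/",  # placeholder path
    meta_file_train="metadata.csv",
)
dataset_config.check_values()
```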
TTS/encoder/README.md ADDED
@@ -0,0 +1,18 @@
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
+ - Watch training on Tensorboard as in TTS
TTS/encoder/__init__.py ADDED
File without changes
TTS/encoder/configs/base_encoder_config.py ADDED
@@ -0,0 +1,61 @@
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Dict, List
3
+
4
+ from coqpit import MISSING
5
+
6
+ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
+
8
+
9
+ @dataclass
10
+ class BaseEncoderConfig(BaseTrainingConfig):
11
+ """Defines parameters for a Generic Encoder model."""
12
+
13
+ model: str = None
14
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
+ # model params
17
+ model_params: Dict = field(
18
+ default_factory=lambda: {
19
+ "model_name": "lstm",
20
+ "input_dim": 80,
21
+ "proj_dim": 256,
22
+ "lstm_dim": 768,
23
+ "num_lstm_layers": 3,
24
+ "use_lstm_with_projection": True,
25
+ }
26
+ )
27
+
28
+ audio_augmentation: Dict = field(default_factory=lambda: {})
29
+
30
+ # training params
31
+ epochs: int = 10000
32
+ loss: str = "angleproto"
33
+ grad_clip: float = 3.0
34
+ lr: float = 0.0001
35
+ optimizer: str = "radam"
36
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
+ lr_decay: bool = False
38
+ warmup_steps: int = 4000
39
+
40
+ # logging params
41
+ tb_model_param_stats: bool = False
42
+ steps_plot_stats: int = 10
43
+ save_step: int = 1000
44
+ print_step: int = 20
45
+ run_eval: bool = False
46
+
47
+ # data loader
48
+ num_classes_in_batch: int = MISSING
49
+ num_utter_per_class: int = MISSING
50
+ eval_num_classes_in_batch: int = None
51
+ eval_num_utter_per_class: int = None
52
+
53
+ num_loader_workers: int = MISSING
54
+ voice_len: float = 1.6
55
+
56
+ def check_values(self):
57
+ super().check_values()
58
+ c = asdict(self)
59
+ assert (
60
+ c["model_params"]["input_dim"] == self.audio.num_mels
61
+ ), " [!] model input dimension must be equal to melspectrogram dimension."
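
The `MISSING` fields above are the ones a user config has to pin down before training; everything else has a usable default. A minimal sketch with placeholder values (not recommended settings):

```python
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig

config = BaseEncoderConfig(
    model="speaker_encoder",   # placeholder model name
    num_classes_in_batch=32,   # MISSING: classes per balanced batch
    num_utter_per_class=4,     # MISSING: utterances sampled per class
    num_loader_workers=4,      # MISSING: dataloader worker processes
)

# The constraint that check_values() enforces:
assert config.model_params["input_dim"] == config.audio.num_mels
```

The effective batch size is `num_classes_in_batch * num_utter_per_class`, which the balanced sampler added later in this commit depends on.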
TTS/encoder/configs/emotion_encoder_config.py ADDED
@@ -0,0 +1,12 @@
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
+ @dataclass
7
+ class EmotionEncoderConfig(BaseEncoderConfig):
8
+ """Defines parameters for Emotion Encoder model."""
9
+
10
+ model: str = "emotion_encoder"
11
+ map_classid_to_classname: dict = None
12
+ class_name_key: str = "emotion_name"
TTS/encoder/configs/speaker_encoder_config.py ADDED
@@ -0,0 +1,11 @@
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
+ @dataclass
7
+ class SpeakerEncoderConfig(BaseEncoderConfig):
8
+ """Defines parameters for Speaker Encoder model."""
9
+
10
+ model: str = "speaker_encoder"
11
+ class_name_key: str = "speaker_name"
TTS/encoder/dataset.py ADDED
@@ -0,0 +1,147 @@
1
+ import random
2
+
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+
6
+ from TTS.encoder.utils.generic_utils import AugmentWAV
7
+
8
+
9
+ class EncoderDataset(Dataset):
10
+ def __init__(
11
+ self,
12
+ config,
13
+ ap,
14
+ meta_data,
15
+ voice_len=1.6,
16
+ num_classes_in_batch=64,
17
+ num_utter_per_class=10,
18
+ verbose=False,
19
+ augmentation_config=None,
20
+ use_torch_spec=None,
21
+ ):
22
+ """
23
+ Args:
24
+ ap (TTS.utils.audio.AudioProcessor): audio processor object.
25
+ meta_data (list): list of dataset instances.
26
+ voice_len (float): voice segment length in seconds.
27
+ verbose (bool): print diagnostic information.
28
+ """
29
+ super().__init__()
30
+ self.config = config
31
+ self.items = meta_data
32
+ self.sample_rate = ap.sample_rate
33
+ self.seq_len = int(voice_len * self.sample_rate)
34
+ self.num_utter_per_class = num_utter_per_class
35
+ self.ap = ap
36
+ self.verbose = verbose
37
+ self.use_torch_spec = use_torch_spec
38
+ self.classes, self.items = self.__parse_items()
39
+
40
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
41
+
42
+ # Data Augmentation
43
+ self.augmentator = None
44
+ self.gaussian_augmentation_config = None
45
+ if augmentation_config:
46
+ self.data_augmentation_p = augmentation_config["p"]
47
+ if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
48
+ self.augmentator = AugmentWAV(ap, augmentation_config)
49
+
50
+ if "gaussian" in augmentation_config.keys():
51
+ self.gaussian_augmentation_config = augmentation_config["gaussian"]
52
+
53
+ if self.verbose:
54
+ print("\n > DataLoader initialization")
55
+ print(f" | > Classes per Batch: {num_classes_in_batch}")
56
+ print(f" | > Number of instances : {len(self.items)}")
57
+ print(f" | > Sequence length: {self.seq_len}")
58
+ print(f" | > Num Classes: {len(self.classes)}")
59
+ print(f" | > Classes: {self.classes}")
60
+
61
+ def load_wav(self, filename):
62
+ audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
63
+ return audio
64
+
65
+ def __parse_items(self):
66
+ class_to_utters = {}
67
+ for item in self.items:
68
+ path_ = item["audio_file"]
69
+ class_name = item[self.config.class_name_key]
70
+ if class_name in class_to_utters.keys():
71
+ class_to_utters[class_name].append(path_)
72
+ else:
73
+ class_to_utters[class_name] = [
74
+ path_,
75
+ ]
76
+
77
+ # drop classes with fewer than self.num_utter_per_class samples
78
+ class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
79
+
80
+ classes = list(class_to_utters.keys())
81
+ classes.sort()
82
+
83
+ new_items = []
84
+ for item in self.items:
85
+ path_ = item["audio_file"]
86
+ class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
87
+ # ignore filtered classes
88
+ if class_name not in classes:
89
+ continue
90
+ # ignore small audios
91
+ if self.load_wav(path_).shape[0] - self.seq_len <= 0:
92
+ continue
93
+
94
+ new_items.append({"wav_file_path": path_, "class_name": class_name})
95
+
96
+ return classes, new_items
97
+
98
+ def __len__(self):
99
+ return len(self.items)
100
+
101
+ def get_num_classes(self):
102
+ return len(self.classes)
103
+
104
+ def get_class_list(self):
105
+ return self.classes
106
+
107
+ def set_classes(self, classes):
108
+ self.classes = classes
109
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
110
+
111
+ def get_map_classid_to_classname(self):
112
+ return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
113
+
114
+ def __getitem__(self, idx):
115
+ return self.items[idx]
116
+
117
+ def collate_fn(self, batch):
118
+ # get the batch class_ids
119
+ labels = []
120
+ feats = []
121
+ for item in batch:
122
+ utter_path = item["wav_file_path"]
123
+ class_name = item["class_name"]
124
+
125
+ # get classid
126
+ class_id = self.classname_to_classid[class_name]
127
+ # load wav file
128
+ wav = self.load_wav(utter_path)
129
+ offset = random.randint(0, wav.shape[0] - self.seq_len)
130
+ wav = wav[offset : offset + self.seq_len]
131
+
132
+ if self.augmentator is not None and self.data_augmentation_p:
133
+ if random.random() < self.data_augmentation_p:
134
+ wav = self.augmentator.apply_one(wav)
135
+
136
+ if not self.use_torch_spec:
137
+ mel = self.ap.melspectrogram(wav)
138
+ feats.append(torch.FloatTensor(mel))
139
+ else:
140
+ feats.append(torch.FloatTensor(wav))
141
+
142
+ labels.append(class_id)
143
+
144
+ feats = torch.stack(feats)
145
+ labels = torch.LongTensor(labels)
146
+
147
+ return feats, labels
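
A sketch of how this dataset is typically wired to the balanced sampler from `TTS/encoder/utils/samplers.py` (added later in this commit) and a `DataLoader`. The `AudioProcessor` import, the metadata paths, and the numbers are assumptions for illustration; real wav files are needed for it to actually run:

```python
from torch.utils.data import DataLoader

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.samplers import PerfectBatchSampler
from TTS.utils.audio import AudioProcessor  # assumed location of the audio processor

config = SpeakerEncoderConfig(num_classes_in_batch=4, num_utter_per_class=2, num_loader_workers=0)
ap = AudioProcessor(**config.audio.to_dict())

# Each item needs an "audio_file" path plus the configured class key ("speaker_name").
meta_data = [
    {"audio_file": "/data/VCTK/p225/p225_001.wav", "speaker_name": "p225"},
    # ... one entry per training utterance (paths are illustrative)
]

dataset = EncoderDataset(
    config,
    ap,
    meta_data,
    voice_len=config.voice_len,
    num_classes_in_batch=config.num_classes_in_batch,
    num_utter_per_class=config.num_utter_per_class,
)
sampler = PerfectBatchSampler(
    dataset.items,
    dataset.get_class_list(),
    batch_size=config.num_classes_in_batch * config.num_utter_per_class,
    num_classes_in_batch=config.num_classes_in_batch,
)
loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=dataset.collate_fn)
# Each batch from collate_fn: feats of shape (B, num_mels, T) and labels of shape (B,).
```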
TTS/encoder/losses.py ADDED
@@ -0,0 +1,226 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
+ # adapted from https://github.com/cvqluu/GE2E-Loss
7
+ class GE2ELoss(nn.Module):
8
+ def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
9
+ """
10
+ Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
11
+ Accepts an input of size (N, M, D)
12
+ where N is the number of speakers in the batch,
13
+ M is the number of utterances per speaker,
14
+ and D is the dimensionality of the embedding vector (e.g. d-vector)
15
+ Args:
16
+ - init_w (float): defines the initial value of w in Equation (5) of [1]
17
+ - init_b (float): defines the initial value of b in Equation (5) of [1]
18
+ """
19
+ super().__init__()
20
+ # pylint: disable=E1102
21
+ self.w = nn.Parameter(torch.tensor(init_w))
22
+ # pylint: disable=E1102
23
+ self.b = nn.Parameter(torch.tensor(init_b))
24
+ self.loss_method = loss_method
25
+
26
+ print(" > Initialized Generalized End-to-End loss")
27
+
28
+ assert self.loss_method in ["softmax", "contrast"]
29
+
30
+ if self.loss_method == "softmax":
31
+ self.embed_loss = self.embed_loss_softmax
32
+ if self.loss_method == "contrast":
33
+ self.embed_loss = self.embed_loss_contrast
34
+
35
+ # pylint: disable=R0201
36
+ def calc_new_centroids(self, dvecs, centroids, spkr, utt):
37
+ """
38
+ Calculates the new centroids excluding the reference utterance
39
+ """
40
+ excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
41
+ excl = torch.mean(excl, 0)
42
+ new_centroids = []
43
+ for i, centroid in enumerate(centroids):
44
+ if i == spkr:
45
+ new_centroids.append(excl)
46
+ else:
47
+ new_centroids.append(centroid)
48
+ return torch.stack(new_centroids)
49
+
50
+ def calc_cosine_sim(self, dvecs, centroids):
51
+ """
52
+ Make the cosine similarity matrix with dims (N,M,N)
53
+ """
54
+ cos_sim_matrix = []
55
+ for spkr_idx, speaker in enumerate(dvecs):
56
+ cs_row = []
57
+ for utt_idx, utterance in enumerate(speaker):
58
+ new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
59
+ # vector based cosine similarity for speed
60
+ cs_row.append(
61
+ torch.clamp(
62
+ torch.mm(
63
+ utterance.unsqueeze(1).transpose(0, 1),
64
+ new_centroids.transpose(0, 1),
65
+ )
66
+ / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
67
+ 1e-6,
68
+ )
69
+ )
70
+ cs_row = torch.cat(cs_row, dim=0)
71
+ cos_sim_matrix.append(cs_row)
72
+ return torch.stack(cos_sim_matrix)
73
+
74
+ # pylint: disable=R0201
75
+ def embed_loss_softmax(self, dvecs, cos_sim_matrix):
76
+ """
77
+ Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
78
+ """
79
+ N, M, _ = dvecs.shape
80
+ L = []
81
+ for j in range(N):
82
+ L_row = []
83
+ for i in range(M):
84
+ L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
85
+ L_row = torch.stack(L_row)
86
+ L.append(L_row)
87
+ return torch.stack(L)
88
+
89
+ # pylint: disable=R0201
90
+ def embed_loss_contrast(self, dvecs, cos_sim_matrix):
91
+ """
92
+ Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
93
+ """
94
+ N, M, _ = dvecs.shape
95
+ L = []
96
+ for j in range(N):
97
+ L_row = []
98
+ for i in range(M):
99
+ centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
100
+ excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
101
+ L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
102
+ L_row = torch.stack(L_row)
103
+ L.append(L_row)
104
+ return torch.stack(L)
105
+
106
+ def forward(self, x, _label=None):
107
+ """
108
+ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
109
+ """
110
+
111
+ assert x.size()[1] >= 2
112
+
113
+ centroids = torch.mean(x, 1)
114
+ cos_sim_matrix = self.calc_cosine_sim(x, centroids)
115
+ torch.clamp(self.w, 1e-6)
116
+ cos_sim_matrix = self.w * cos_sim_matrix + self.b
117
+ L = self.embed_loss(x, cos_sim_matrix)
118
+ return L.mean()
119
+
120
+
121
+ # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
122
+ class AngleProtoLoss(nn.Module):
123
+ """
124
+ Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
125
+ Accepts an input of size (N, M, D)
126
+ where N is the number of speakers in the batch,
127
+ M is the number of utterances per speaker,
128
+ and D is the dimensionality of the embedding vector
129
+ Args:
130
+ - init_w (float): defines the initial value of w
131
+ - init_b (float): defines the initial value of b
132
+ """
133
+
134
+ def __init__(self, init_w=10.0, init_b=-5.0):
135
+ super().__init__()
136
+ # pylint: disable=E1102
137
+ self.w = nn.Parameter(torch.tensor(init_w))
138
+ # pylint: disable=E1102
139
+ self.b = nn.Parameter(torch.tensor(init_b))
140
+ self.criterion = torch.nn.CrossEntropyLoss()
141
+
142
+ print(" > Initialized Angular Prototypical loss")
143
+
144
+ def forward(self, x, _label=None):
145
+ """
146
+ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
147
+ """
148
+
149
+ assert x.size()[1] >= 2
150
+
151
+ out_anchor = torch.mean(x[:, 1:, :], 1)
152
+ out_positive = x[:, 0, :]
153
+ num_speakers = out_anchor.size()[0]
154
+
155
+ cos_sim_matrix = F.cosine_similarity(
156
+ out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
157
+ out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
158
+ )
159
+ torch.clamp(self.w, 1e-6)
160
+ cos_sim_matrix = cos_sim_matrix * self.w + self.b
161
+ label = torch.arange(num_speakers).to(cos_sim_matrix.device)
162
+ L = self.criterion(cos_sim_matrix, label)
163
+ return L
164
+
165
+
166
+ class SoftmaxLoss(nn.Module):
167
+ """
168
+ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
169
+ Args:
170
+ - embedding_dim (int): speaker embedding dim
171
+ - n_speakers (int): number of speakers
172
+ """
173
+
174
+ def __init__(self, embedding_dim, n_speakers):
175
+ super().__init__()
176
+
177
+ self.criterion = torch.nn.CrossEntropyLoss()
178
+ self.fc = nn.Linear(embedding_dim, n_speakers)
179
+
180
+ print("Initialised Softmax Loss")
181
+
182
+ def forward(self, x, label=None):
183
+ # reshape for compatibility
184
+ x = x.reshape(-1, x.size()[-1])
185
+ label = label.reshape(-1)
186
+
187
+ x = self.fc(x)
188
+ L = self.criterion(x, label)
189
+
190
+ return L
191
+
192
+ def inference(self, embedding):
193
+ x = self.fc(embedding)
194
+ activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
195
+ class_id = torch.argmax(activations)
196
+ return class_id
197
+
198
+
199
+ class SoftmaxAngleProtoLoss(nn.Module):
200
+ """
201
+ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
202
+ Args:
203
+ - embedding_dim (int): speaker embedding dim
204
+ - n_speakers (int): number of speakers
205
+ - init_w (float): defines the initial value of w
206
+ - init_b (float): defines the initial value of b
207
+ """
208
+
209
+ def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
210
+ super().__init__()
211
+
212
+ self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
213
+ self.angleproto = AngleProtoLoss(init_w, init_b)
214
+
215
+ print("Initialised SoftmaxAnglePrototypical Loss")
216
+
217
+ def forward(self, x, label=None):
218
+ """
219
+ Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
220
+ """
221
+
222
+ Lp = self.angleproto(x)
223
+
224
+ Ls = self.softmax(x, label)
225
+
226
+ return Ls + Lp
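
Each of these criteria expects the batch reshaped to (speakers per batch, utterances per speaker, embedding dim) before the call. A self-contained sketch with random embeddings (dimensions are arbitrary):

```python
import torch

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss

N, M, D = 4, 5, 256  # 4 speakers x 5 utterances x 256-dim embeddings
dvecs = torch.randn(N, M, D, requires_grad=True)
labels = torch.arange(N).unsqueeze(1).repeat(1, M)  # (N, M) class ids

print(GE2ELoss(loss_method="softmax")(dvecs))        # scalar GE2E loss
print(AngleProtoLoss()(dvecs))                       # scalar angular prototypical loss
print(SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)(dvecs, labels))  # sum of both terms
```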
TTS/encoder/models/base_encoder.py ADDED
@@ -0,0 +1,154 @@
1
+ import numpy as np
2
+ import torch
3
+ import torchaudio
4
+ from coqpit import Coqpit
5
+ from torch import nn
6
+
7
+ from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
+ from TTS.utils.generic_utils import set_init_dict
9
+ from TTS.utils.io import load_fsspec
10
+
11
+
12
+ class PreEmphasis(nn.Module):
13
+ def __init__(self, coefficient=0.97):
14
+ super().__init__()
15
+ self.coefficient = coefficient
16
+ self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
17
+
18
+ def forward(self, x):
19
+ assert len(x.size()) == 2
20
+
21
+ x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
22
+ return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
23
+
24
+
25
+ class BaseEncoder(nn.Module):
26
+ """Base `encoder` class. Every new `encoder` model must inherit this.
27
+
28
+ It defines common `encoder` specific functions.
29
+ """
30
+
31
+ # pylint: disable=W0102
32
+ def __init__(self):
33
+ super(BaseEncoder, self).__init__()
34
+
35
+ def get_torch_mel_spectrogram_class(self, audio_config):
36
+ return torch.nn.Sequential(
37
+ PreEmphasis(audio_config["preemphasis"]),
38
+ # TorchSTFT(
39
+ # n_fft=audio_config["fft_size"],
40
+ # hop_length=audio_config["hop_length"],
41
+ # win_length=audio_config["win_length"],
42
+ # sample_rate=audio_config["sample_rate"],
43
+ # window="hamming_window",
44
+ # mel_fmin=0.0,
45
+ # mel_fmax=None,
46
+ # use_htk=True,
47
+ # do_amp_to_db=False,
48
+ # n_mels=audio_config["num_mels"],
49
+ # power=2.0,
50
+ # use_mel=True,
51
+ # mel_norm=None,
52
+ # )
53
+ torchaudio.transforms.MelSpectrogram(
54
+ sample_rate=audio_config["sample_rate"],
55
+ n_fft=audio_config["fft_size"],
56
+ win_length=audio_config["win_length"],
57
+ hop_length=audio_config["hop_length"],
58
+ window_fn=torch.hamming_window,
59
+ n_mels=audio_config["num_mels"],
60
+ ),
61
+ )
62
+
63
+ @torch.no_grad()
64
+ def inference(self, x, l2_norm=True):
65
+ return self.forward(x, l2_norm)
66
+
67
+ @torch.no_grad()
68
+ def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
69
+ """
70
+ Generate embeddings for a batch of utterances
71
+ x: 1xTxD
72
+ """
73
+ # map to the waveform size
74
+ if self.use_torch_spec:
75
+ num_frames = num_frames * self.audio_config["hop_length"]
76
+
77
+ max_len = x.shape[1]
78
+
79
+ if max_len < num_frames:
80
+ num_frames = max_len
81
+
82
+ offsets = np.linspace(0, max_len - num_frames, num=num_eval)
83
+
84
+ frames_batch = []
85
+ for offset in offsets:
86
+ offset = int(offset)
87
+ end_offset = int(offset + num_frames)
88
+ frames = x[:, offset:end_offset]
89
+ frames_batch.append(frames)
90
+
91
+ frames_batch = torch.cat(frames_batch, dim=0)
92
+ embeddings = self.inference(frames_batch, l2_norm=l2_norm)
93
+
94
+ if return_mean:
95
+ embeddings = torch.mean(embeddings, dim=0, keepdim=True)
96
+ return embeddings
97
+
98
+ def get_criterion(self, c: Coqpit, num_classes=None):
99
+ if c.loss == "ge2e":
100
+ criterion = GE2ELoss(loss_method="softmax")
101
+ elif c.loss == "angleproto":
102
+ criterion = AngleProtoLoss()
103
+ elif c.loss == "softmaxproto":
104
+ criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
105
+ else:
106
+ raise Exception("%s is not a supported loss" % c.loss)
107
+ return criterion
108
+
109
+ def load_checkpoint(
110
+ self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
111
+ ):
112
+ state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
113
+ try:
114
+ self.load_state_dict(state["model"])
115
+ except (KeyError, RuntimeError) as error:
116
+ # If eval raise the error
117
+ if eval:
118
+ raise error
119
+
120
+ print(" > Partial model initialization.")
121
+ model_dict = self.state_dict()
122
+ model_dict = set_init_dict(model_dict, state["model"], config)
123
+ self.load_state_dict(model_dict)
124
+ del model_dict
125
+
126
+ # load the criterion for restore_path
127
+ if criterion is not None and "criterion" in state:
128
+ try:
129
+ criterion.load_state_dict(state["criterion"])
130
+ except (KeyError, RuntimeError) as error:
131
+ print(" > Criterion load ignored because of:", error)
132
+
133
+ # instance and load the criterion for the encoder classifier in inference time
134
+ if (
135
+ eval
136
+ and criterion is None
137
+ and "criterion" in state
138
+ and getattr(config, "map_classid_to_classname", None) is not None
139
+ ):
140
+ criterion = self.get_criterion(config, len(config.map_classid_to_classname))
141
+ criterion.load_state_dict(state["criterion"])
142
+
143
+ if use_cuda:
144
+ self.cuda()
145
+ if criterion is not None:
146
+ criterion = criterion.cuda()
147
+
148
+ if eval:
149
+ self.eval()
150
+ assert not self.training
151
+
152
+ if not eval:
153
+ return criterion, state["step"]
154
+ return criterion
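
To illustrate the windowed `compute_embedding` path with on-the-fly spectrograms, a sketch using the LSTM encoder defined in the next file. The audio dict below contains only the keys read by `get_torch_mel_spectrogram_class` and `compute_embedding`, with placeholder values:

```python
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

audio_config = {
    "preemphasis": 0.97,
    "sample_rate": 16000,
    "fft_size": 512,
    "win_length": 400,
    "hop_length": 160,
    "num_mels": 64,
}

# use_torch_spec=True makes the encoder consume raw waveforms and build mels internally.
model = LSTMSpeakerEncoder(input_dim=64, proj_dim=256, use_torch_spec=True, audio_config=audio_config)
model.eval()

wav = torch.randn(1, 3 * audio_config["sample_rate"])  # a fake 3-second waveform, shape (1, T)

# Slices num_eval windows of num_frames frames, embeds each, and averages the results.
emb = model.compute_embedding(wav, num_frames=250, num_eval=10, return_mean=True, l2_norm=True)
print(emb.shape)  # torch.Size([1, 256])
```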
TTS/encoder/models/lstm.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from TTS.encoder.models.base_encoder import BaseEncoder
5
+
6
+
7
+ class LSTMWithProjection(nn.Module):
8
+ def __init__(self, input_size, hidden_size, proj_size):
9
+ super().__init__()
10
+ self.input_size = input_size
11
+ self.hidden_size = hidden_size
12
+ self.proj_size = proj_size
13
+ self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14
+ self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15
+
16
+ def forward(self, x):
17
+ self.lstm.flatten_parameters()
18
+ o, (_, _) = self.lstm(x)
19
+ return self.linear(o)
20
+
21
+
22
+ class LSTMWithoutProjection(nn.Module):
23
+ def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24
+ super().__init__()
25
+ self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26
+ self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27
+ self.relu = nn.ReLU()
28
+
29
+ def forward(self, x):
30
+ _, (hidden, _) = self.lstm(x)
31
+ return self.relu(self.linear(hidden[-1]))
32
+
33
+
34
+ class LSTMSpeakerEncoder(BaseEncoder):
35
+ def __init__(
36
+ self,
37
+ input_dim,
38
+ proj_dim=256,
39
+ lstm_dim=768,
40
+ num_lstm_layers=3,
41
+ use_lstm_with_projection=True,
42
+ use_torch_spec=False,
43
+ audio_config=None,
44
+ ):
45
+ super().__init__()
46
+ self.use_lstm_with_projection = use_lstm_with_projection
47
+ self.use_torch_spec = use_torch_spec
48
+ self.audio_config = audio_config
49
+ self.proj_dim = proj_dim
50
+
51
+ layers = []
52
+ # choose the LSTM layer type
53
+ if use_lstm_with_projection:
54
+ layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55
+ for _ in range(num_lstm_layers - 1):
56
+ layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57
+ self.layers = nn.Sequential(*layers)
58
+ else:
59
+ self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60
+
61
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
62
+
63
+ if self.use_torch_spec:
64
+ self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65
+ else:
66
+ self.torch_spec = None
67
+
68
+ self._init_layers()
69
+
70
+ def _init_layers(self):
71
+ for name, param in self.layers.named_parameters():
72
+ if "bias" in name:
73
+ nn.init.constant_(param, 0.0)
74
+ elif "weight" in name:
75
+ nn.init.xavier_normal_(param)
76
+
77
+ def forward(self, x, l2_norm=True):
78
+ """Forward pass of the model.
79
+
80
+ Args:
81
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82
+ to compute the spectrogram on-the-fly.
83
+ l2_norm (bool): Whether to L2-normalize the outputs.
84
+
85
+ Shapes:
86
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87
+ """
88
+ with torch.no_grad():
89
+ with torch.cuda.amp.autocast(enabled=False):
90
+ if self.use_torch_spec:
91
+ x.squeeze_(1)
92
+ x = self.torch_spec(x)
93
+ x = self.instancenorm(x).transpose(1, 2)
94
+ d = self.layers(x)
95
+ if self.use_lstm_with_projection:
96
+ d = d[:, -1]
97
+ if l2_norm:
98
+ d = torch.nn.functional.normalize(d, p=2, dim=1)
99
+ return d
TTS/encoder/models/resnet.py ADDED
@@ -0,0 +1,200 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ # from TTS.utils.audio import TorchSTFT
5
+ from TTS.encoder.models.base_encoder import BaseEncoder
6
+
7
+
8
+ class SELayer(nn.Module):
9
+ def __init__(self, channel, reduction=8):
10
+ super(SELayer, self).__init__()
11
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
12
+ self.fc = nn.Sequential(
13
+ nn.Linear(channel, channel // reduction),
14
+ nn.ReLU(inplace=True),
15
+ nn.Linear(channel // reduction, channel),
16
+ nn.Sigmoid(),
17
+ )
18
+
19
+ def forward(self, x):
20
+ b, c, _, _ = x.size()
21
+ y = self.avg_pool(x).view(b, c)
22
+ y = self.fc(y).view(b, c, 1, 1)
23
+ return x * y
24
+
25
+
26
+ class SEBasicBlock(nn.Module):
27
+ expansion = 1
28
+
29
+ def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
30
+ super(SEBasicBlock, self).__init__()
31
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
32
+ self.bn1 = nn.BatchNorm2d(planes)
33
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
34
+ self.bn2 = nn.BatchNorm2d(planes)
35
+ self.relu = nn.ReLU(inplace=True)
36
+ self.se = SELayer(planes, reduction)
37
+ self.downsample = downsample
38
+ self.stride = stride
39
+
40
+ def forward(self, x):
41
+ residual = x
42
+
43
+ out = self.conv1(x)
44
+ out = self.relu(out)
45
+ out = self.bn1(out)
46
+
47
+ out = self.conv2(out)
48
+ out = self.bn2(out)
49
+ out = self.se(out)
50
+
51
+ if self.downsample is not None:
52
+ residual = self.downsample(x)
53
+
54
+ out += residual
55
+ out = self.relu(out)
56
+ return out
57
+
58
+
59
+ class ResNetSpeakerEncoder(BaseEncoder):
60
+ """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
61
+ Adapted from: https://github.com/clovaai/voxceleb_trainer
62
+ """
63
+
64
+ # pylint: disable=W0102
65
+ def __init__(
66
+ self,
67
+ input_dim=64,
68
+ proj_dim=512,
69
+ layers=[3, 4, 6, 3],
70
+ num_filters=[32, 64, 128, 256],
71
+ encoder_type="ASP",
72
+ log_input=False,
73
+ use_torch_spec=False,
74
+ audio_config=None,
75
+ ):
76
+ super(ResNetSpeakerEncoder, self).__init__()
77
+
78
+ self.encoder_type = encoder_type
79
+ self.input_dim = input_dim
80
+ self.log_input = log_input
81
+ self.use_torch_spec = use_torch_spec
82
+ self.audio_config = audio_config
83
+ self.proj_dim = proj_dim
84
+
85
+ self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
86
+ self.relu = nn.ReLU(inplace=True)
87
+ self.bn1 = nn.BatchNorm2d(num_filters[0])
88
+
89
+ self.inplanes = num_filters[0]
90
+ self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
91
+ self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
92
+ self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
93
+ self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
94
+
95
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
96
+
97
+ if self.use_torch_spec:
98
+ self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
99
+ else:
100
+ self.torch_spec = None
101
+
102
+ outmap_size = int(self.input_dim / 8)
103
+
104
+ self.attention = nn.Sequential(
105
+ nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
106
+ nn.ReLU(),
107
+ nn.BatchNorm1d(128),
108
+ nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
109
+ nn.Softmax(dim=2),
110
+ )
111
+
112
+ if self.encoder_type == "SAP":
113
+ out_dim = num_filters[3] * outmap_size
114
+ elif self.encoder_type == "ASP":
115
+ out_dim = num_filters[3] * outmap_size * 2
116
+ else:
117
+ raise ValueError("Undefined encoder")
118
+
119
+ self.fc = nn.Linear(out_dim, proj_dim)
120
+
121
+ self._init_layers()
122
+
123
+ def _init_layers(self):
124
+ for m in self.modules():
125
+ if isinstance(m, nn.Conv2d):
126
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
127
+ elif isinstance(m, nn.BatchNorm2d):
128
+ nn.init.constant_(m.weight, 1)
129
+ nn.init.constant_(m.bias, 0)
130
+
131
+ def create_layer(self, block, planes, blocks, stride=1):
132
+ downsample = None
133
+ if stride != 1 or self.inplanes != planes * block.expansion:
134
+ downsample = nn.Sequential(
135
+ nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
136
+ nn.BatchNorm2d(planes * block.expansion),
137
+ )
138
+
139
+ layers = []
140
+ layers.append(block(self.inplanes, planes, stride, downsample))
141
+ self.inplanes = planes * block.expansion
142
+ for _ in range(1, blocks):
143
+ layers.append(block(self.inplanes, planes))
144
+
145
+ return nn.Sequential(*layers)
146
+
147
+ # pylint: disable=R0201
148
+ def new_parameter(self, *size):
149
+ out = nn.Parameter(torch.FloatTensor(*size))
150
+ nn.init.xavier_normal_(out)
151
+ return out
152
+
153
+ def forward(self, x, l2_norm=False):
154
+ """Forward pass of the model.
155
+
156
+ Args:
157
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
158
+ to compute the spectrogram on-the-fly.
159
+ l2_norm (bool): Whether to L2-normalize the outputs.
160
+
161
+ Shapes:
162
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
163
+ """
164
+ with torch.no_grad():
165
+ with torch.cuda.amp.autocast(enabled=False):
166
+ x.squeeze_(1)
167
+ # if use_torch_spec is set, compute the spectrogram here; otherwise the input is already a mel spec from the AP
168
+ if self.use_torch_spec:
169
+ x = self.torch_spec(x)
170
+
171
+ if self.log_input:
172
+ x = (x + 1e-6).log()
173
+ x = self.instancenorm(x).unsqueeze(1)
174
+
175
+ x = self.conv1(x)
176
+ x = self.relu(x)
177
+ x = self.bn1(x)
178
+
179
+ x = self.layer1(x)
180
+ x = self.layer2(x)
181
+ x = self.layer3(x)
182
+ x = self.layer4(x)
183
+
184
+ x = x.reshape(x.size()[0], -1, x.size()[-1])
185
+
186
+ w = self.attention(x)
187
+
188
+ if self.encoder_type == "SAP":
189
+ x = torch.sum(x * w, dim=2)
190
+ elif self.encoder_type == "ASP":
191
+ mu = torch.sum(x * w, dim=2)
192
+ sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
193
+ x = torch.cat((mu, sg), 1)
194
+
195
+ x = x.view(x.size()[0], -1)
196
+ x = self.fc(x)
197
+
198
+ if l2_norm:
199
+ x = torch.nn.functional.normalize(x, p=2, dim=1)
200
+ return x
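
For reference, a shape-only sketch of the ResNet encoder under the default ASP (attentive statistics pooling) head, with a precomputed mel input (`use_torch_spec=False`); the dimensions are arbitrary:

```python
import torch

from TTS.encoder.models.resnet import ResNetSpeakerEncoder

model = ResNetSpeakerEncoder(input_dim=64, proj_dim=512, encoder_type="ASP")
model.eval()

mels = torch.randn(3, 64, 300)   # (batch, num_mels, num_frames)
emb = model(mels, l2_norm=True)  # attentive statistics pooling -> (3, 512)
print(emb.shape)
```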
TTS/encoder/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ umap-learn
2
+ numpy>=1.17.0
TTS/encoder/utils/__init__.py ADDED
File without changes
TTS/encoder/utils/generic_utils.py ADDED
@@ -0,0 +1,184 @@
1
+ import datetime
2
+ import glob
3
+ import os
4
+ import random
5
+ import re
6
+
7
+ import numpy as np
8
+ from scipy import signal
9
+
10
+ from TTS.encoder.models.lstm import LSTMSpeakerEncoder
11
+ from TTS.encoder.models.resnet import ResNetSpeakerEncoder
12
+ from TTS.utils.io import save_fsspec
13
+
14
+
15
+ class AugmentWAV(object):
16
+ def __init__(self, ap, augmentation_config):
17
+
18
+ self.ap = ap
19
+ self.use_additive_noise = False
20
+
21
+ if "additive" in augmentation_config.keys():
22
+ self.additive_noise_config = augmentation_config["additive"]
23
+ additive_path = self.additive_noise_config["sounds_path"]
24
+ if additive_path:
25
+ self.use_additive_noise = True
26
+ # get noise types
27
+ self.additive_noise_types = []
28
+ for key in self.additive_noise_config.keys():
29
+ if isinstance(self.additive_noise_config[key], dict):
30
+ self.additive_noise_types.append(key)
31
+
32
+ additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
33
+
34
+ self.noise_list = {}
35
+
36
+ for wav_file in additive_files:
37
+ noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
38
+ # ignore not listed directories
39
+ if noise_dir not in self.additive_noise_types:
40
+ continue
41
+ if noise_dir not in self.noise_list:
42
+ self.noise_list[noise_dir] = []
43
+ self.noise_list[noise_dir].append(wav_file)
44
+
45
+ print(
46
+ f" | > Using Additive Noise Augmentation: with {len(additive_files)} audio instances from {self.additive_noise_types}"
47
+ )
48
+
49
+ self.use_rir = False
50
+
51
+ if "rir" in augmentation_config.keys():
52
+ self.rir_config = augmentation_config["rir"]
53
+ if self.rir_config["rir_path"]:
54
+ self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
55
+ self.use_rir = True
56
+
57
+ print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audio instances")
58
+
59
+ self.create_augmentation_global_list()
60
+
61
+ def create_augmentation_global_list(self):
62
+ if self.use_additive_noise:
63
+ self.global_noise_list = self.additive_noise_types
64
+ else:
65
+ self.global_noise_list = []
66
+ if self.use_rir:
67
+ self.global_noise_list.append("RIR_AUG")
68
+
69
+ def additive_noise(self, noise_type, audio):
70
+
71
+ clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
72
+
73
+ noise_list = random.sample(
74
+ self.noise_list[noise_type],
75
+ random.randint(
76
+ self.additive_noise_config[noise_type]["min_num_noises"],
77
+ self.additive_noise_config[noise_type]["max_num_noises"],
78
+ ),
79
+ )
80
+
81
+ audio_len = audio.shape[0]
82
+ noises_wav = None
83
+ for noise in noise_list:
84
+ noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
85
+
86
+ if noiseaudio.shape[0] < audio_len:
87
+ continue
88
+
89
+ noise_snr = random.uniform(
90
+ self.additive_noise_config[noise_type]["min_snr_in_db"],
91
+ self.additive_noise_config[noise_type]["max_snr_in_db"],
92
+ )
93
+ noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
94
+ noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
95
+
96
+ if noises_wav is None:
97
+ noises_wav = noise_wav
98
+ else:
99
+ noises_wav += noise_wav
100
+
101
+ # if every sampled noise file was shorter than the audio, sample again
102
+ if noises_wav is None:
103
+ return self.additive_noise(noise_type, audio)
104
+
105
+ return audio + noises_wav
106
+
107
+ def reverberate(self, audio):
108
+ audio_len = audio.shape[0]
109
+
110
+ rir_file = random.choice(self.rir_files)
111
+ rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
112
+ rir = rir / np.sqrt(np.sum(rir**2))
113
+ return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
114
+
115
+ def apply_one(self, audio):
116
+ noise_type = random.choice(self.global_noise_list)
117
+ if noise_type == "RIR_AUG":
118
+ return self.reverberate(audio)
119
+
120
+ return self.additive_noise(noise_type, audio)
121
+
122
+
123
+ def to_camel(text):
124
+ text = text.capitalize()
125
+ return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
126
+
127
+
128
+ def setup_encoder_model(config: "Coqpit"):
129
+ if config.model_params["model_name"].lower() == "lstm":
130
+ model = LSTMSpeakerEncoder(
131
+ config.model_params["input_dim"],
132
+ config.model_params["proj_dim"],
133
+ config.model_params["lstm_dim"],
134
+ config.model_params["num_lstm_layers"],
135
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
136
+ audio_config=config.audio,
137
+ )
138
+ elif config.model_params["model_name"].lower() == "resnet":
139
+ model = ResNetSpeakerEncoder(
140
+ input_dim=config.model_params["input_dim"],
141
+ proj_dim=config.model_params["proj_dim"],
142
+ log_input=config.model_params.get("log_input", False),
143
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
144
+ audio_config=config.audio,
145
+ )
146
+ return model
147
+
148
+
149
+ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
150
+ checkpoint_path = "checkpoint_{}.pth".format(current_step)
151
+ checkpoint_path = os.path.join(out_path, checkpoint_path)
152
+ print(" | | > Checkpoint saving : {}".format(checkpoint_path))
153
+
154
+ new_state_dict = model.state_dict()
155
+ state = {
156
+ "model": new_state_dict,
157
+ "optimizer": optimizer.state_dict() if optimizer is not None else None,
158
+ "criterion": criterion.state_dict(),
159
+ "step": current_step,
160
+ "epoch": epoch,
161
+ "loss": model_loss,
162
+ "date": datetime.date.today().strftime("%B %d, %Y"),
163
+ }
164
+ save_fsspec(state, checkpoint_path)
165
+
166
+
167
+ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
168
+ if model_loss < best_loss:
169
+ new_state_dict = model.state_dict()
170
+ state = {
171
+ "model": new_state_dict,
172
+ "optimizer": optimizer.state_dict(),
173
+ "criterion": criterion.state_dict(),
174
+ "step": current_step,
175
+ "epoch": epoch,
176
+ "loss": model_loss,
177
+ "date": datetime.date.today().strftime("%B %d, %Y"),
178
+ }
179
+ best_loss = model_loss
180
+ bestmodel_path = "best_model.pth"
181
+ bestmodel_path = os.path.join(out_path, bestmodel_path)
182
+ print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
183
+ save_fsspec(state, bestmodel_path)
184
+ return best_loss
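
For reference, a sketch of the `augmentation_config` dictionary that `AugmentWAV` (and `EncoderDataset`) read. Only the keys accessed by the code above are load-bearing; the paths, category names, and numbers are placeholders:

```python
augmentation_config = {
    "p": 0.5,  # probability that EncoderDataset augments a given sample
    "additive": {
        "sounds_path": "/data/musan/",  # scanned recursively for .wav files
        # Every sub-dict other than "sounds_path" defines one noise category,
        # named after the corresponding sub-directory under sounds_path.
        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
    "rir": {
        "rir_path": "/data/RIRS_NOISES/simulated_rirs/",
        "conv_mode": "full",  # passed straight to scipy.signal.convolve
    },
    "gaussian": {"p": 0.0},  # optional; stored by EncoderDataset (keys here are illustrative)
}
```

Given an `AudioProcessor` instance `ap`, `AugmentWAV(ap, augmentation_config).apply_one(wav)` then applies either additive noise or reverberation to a 1-D waveform array.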
TTS/encoder/utils/io.py ADDED
@@ -0,0 +1,38 @@
1
+ import datetime
2
+ import os
3
+
4
+ from TTS.utils.io import save_fsspec
5
+
6
+
7
+ def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
8
+ checkpoint_path = "checkpoint_{}.pth".format(current_step)
9
+ checkpoint_path = os.path.join(out_path, checkpoint_path)
10
+ print(" | | > Checkpoint saving : {}".format(checkpoint_path))
11
+
12
+ new_state_dict = model.state_dict()
13
+ state = {
14
+ "model": new_state_dict,
15
+ "optimizer": optimizer.state_dict() if optimizer is not None else None,
16
+ "step": current_step,
17
+ "loss": model_loss,
18
+ "date": datetime.date.today().strftime("%B %d, %Y"),
19
+ }
20
+ save_fsspec(state, checkpoint_path)
21
+
22
+
23
+ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
24
+ if model_loss < best_loss:
25
+ new_state_dict = model.state_dict()
26
+ state = {
27
+ "model": new_state_dict,
28
+ "optimizer": optimizer.state_dict(),
29
+ "step": current_step,
30
+ "loss": model_loss,
31
+ "date": datetime.date.today().strftime("%B %d, %Y"),
32
+ }
33
+ best_loss = model_loss
34
+ bestmodel_path = "best_model.pth"
35
+ bestmodel_path = os.path.join(out_path, bestmodel_path)
36
+ print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
37
+ save_fsspec(state, bestmodel_path)
38
+ return best_loss
TTS/encoder/utils/prepare_voxceleb.py ADDED
@@ -0,0 +1,219 @@
1
+ # coding=utf-8
2
+ # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
3
+ # All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+ # Only support eager mode and TF>=2.0.0
18
+ # pylint: disable=no-member, invalid-name, relative-beyond-top-level
19
+ # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
20
+ """ voxceleb 1 & 2 """
21
+
22
+ import hashlib
23
+ import os
24
+ import subprocess
25
+ import sys
26
+ import zipfile
27
+
28
+ import pandas
29
+ import soundfile as sf
30
+ from absl import logging
31
+
32
+ SUBSETS = {
33
+ "vox1_dev_wav": [
34
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
35
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
36
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
37
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
38
+ ],
39
+ "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
40
+ "vox2_dev_aac": [
41
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
42
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
43
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
44
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
45
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
46
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
47
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
48
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
49
+ ],
50
+ "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
51
+ }
52
+
53
+ MD5SUM = {
54
+ "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
55
+ "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
56
+ "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
57
+ "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
58
+ }
59
+
60
+ USER = {"user": "", "password": ""}
61
+
62
+ speaker_id_dict = {}
63
+
64
+
65
+ def download_and_extract(directory, subset, urls):
66
+ """Download and extract the given split of dataset.
67
+
68
+ Args:
69
+ directory: the directory where to put the downloaded data.
70
+ subset: subset name of the corpus.
71
+ urls: the list of urls to download the data file.
72
+ """
73
+ os.makedirs(directory, exist_ok=True)
74
+
75
+ try:
76
+ for url in urls:
77
+ zip_filepath = os.path.join(directory, url.split("/")[-1])
78
+ if os.path.exists(zip_filepath):
79
+ continue
80
+ logging.info("Downloading %s to %s" % (url, zip_filepath))
81
+ subprocess.call(
82
+ "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
83
+ shell=True,
84
+ )
85
+
86
+ statinfo = os.stat(zip_filepath)
87
+ logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
88
+
89
+ # concatenate all parts into zip files
90
+ if ".zip" not in zip_filepath:
91
+ zip_filepath = "_".join(zip_filepath.split("_")[:-1])
92
+ subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True)
93
+ zip_filepath += ".zip"
94
+ extract_path = zip_filepath.strip(".zip")
95
+
96
+ # check zip file md5sum
97
+ with open(zip_filepath, "rb") as f_zip:
98
+ md5 = hashlib.md5(f_zip.read()).hexdigest()
99
+ if md5 != MD5SUM[subset]:
100
+ raise ValueError("md5sum of %s mismatch" % zip_filepath)
101
+
102
+ with zipfile.ZipFile(zip_filepath, "r") as zfile:
103
+ zfile.extractall(directory)
104
+ extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
105
+ subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True)
106
+ finally:
107
+ # os.remove(zip_filepath)
108
+ pass
109
+
110
+
111
+ def exec_cmd(cmd):
112
+ """Run a command in a subprocess.
113
+ Args:
114
+ cmd: command line to be executed.
115
+ Return:
116
+ int, the return code.
117
+ """
118
+ try:
119
+ retcode = subprocess.call(cmd, shell=True)
120
+ if retcode < 0:
121
+ logging.info(f"Child was terminated by signal {retcode}")
122
+ except OSError as e:
123
+ logging.info(f"Execution failed: {e}")
124
+ retcode = -999
125
+ return retcode
126
+
127
+
128
+ def decode_aac_with_ffmpeg(aac_file, wav_file):
129
+ """Decode a given AAC file into WAV using ffmpeg.
130
+ Args:
131
+ aac_file: file path to input AAC file.
132
+ wav_file: file path to output WAV file.
133
+ Return:
134
+ bool, True if success.
135
+ """
136
+ cmd = f"ffmpeg -i {aac_file} {wav_file}"
137
+ logging.info(f"Decoding aac file using command line: {cmd}")
138
+ ret = exec_cmd(cmd)
139
+ if ret != 0:
140
+ logging.error(f"Failed to decode aac file with retcode {ret}")
141
+ logging.error("Please check your ffmpeg installation.")
142
+ return False
143
+ return True
144
+
145
+
146
+ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
147
+ """Optionally convert AAC to WAV and make speaker labels.
148
+ Args:
149
+ input_dir: the directory which holds the input dataset.
150
+ subset: the name of the specified subset. e.g. vox1_dev_wav
151
+ output_dir: the directory to place the newly generated csv files.
152
+ output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
153
+ """
154
+
155
+ logging.info("Preprocessing audio and label for subset %s" % subset)
156
+ source_dir = os.path.join(input_dir, subset)
157
+
158
+ files = []
159
+ # Convert all AAC files into WAV format and generate the csv at the same time.
160
+ for root, _, filenames in os.walk(source_dir):
161
+ for filename in filenames:
162
+ name, ext = os.path.splitext(filename)
163
+ if ext.lower() == ".wav":
164
+ _, ext2 = os.path.splitext(name)
165
+ if ext2:
166
+ continue
167
+ wav_file = os.path.join(root, filename)
168
+ elif ext.lower() == ".m4a":
169
+ # Convert AAC to WAV.
170
+ aac_file = os.path.join(root, filename)
171
+ wav_file = aac_file + ".wav"
172
+ if not os.path.exists(wav_file):
173
+ if not decode_aac_with_ffmpeg(aac_file, wav_file):
174
+ raise RuntimeError("Audio decoding failed.")
175
+ else:
176
+ continue
177
+ speaker_name = root.split(os.path.sep)[-2]
178
+ if speaker_name not in speaker_id_dict:
179
+ num = len(speaker_id_dict)
180
+ speaker_id_dict[speaker_name] = num
181
+ # wav_filesize = os.path.getsize(wav_file)
182
+ wav_length = len(sf.read(wav_file)[0])
183
+ files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name))
184
+
185
+ # Write to CSV file which contains four columns:
186
+ # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
187
+ csv_file_path = os.path.join(output_dir, output_file)
188
+ df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
189
+ df.to_csv(csv_file_path, index=False, sep="\t")
190
+ logging.info("Successfully generated csv file {}".format(csv_file_path))
191
+
192
+
193
+ def processor(directory, subset, force_process):
194
+ """download and process"""
195
+ urls = SUBSETS
196
+ if subset not in urls:
197
+ raise ValueError(subset, "is not in voxceleb")
198
+
199
+ subset_csv = os.path.join(directory, subset + ".csv")
200
+ if not force_process and os.path.exists(subset_csv):
201
+ return subset_csv
202
+
203
+ logging.info("Downloading and processing the voxceleb data in %s", directory)
204
+ logging.info("Preparing subset %s", subset)
205
+ download_and_extract(directory, subset, urls[subset])
206
+ convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
207
+ logging.info("Finished downloading and processing")
208
+ return subset_csv
209
+
210
+
211
+ if __name__ == "__main__":
212
+ logging.set_verbosity(logging.INFO)
213
+ if len(sys.argv) != 4:
214
+ print("Usage: python prepare_data.py save_directory user password")
215
+ sys.exit()
216
+
217
+ DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
218
+ for SUBSET in SUBSETS:
219
+ processor(DIR, SUBSET, False)
TTS/encoder/utils/samplers.py ADDED
@@ -0,0 +1,114 @@
1
+ import random
2
+
3
+ from torch.utils.data.sampler import Sampler, SubsetRandomSampler
4
+
5
+
6
+ class SubsetSampler(Sampler):
7
+ """
8
+ Samples elements sequentially from a given list of indices.
9
+
10
+ Args:
11
+ indices (list): a sequence of indices
12
+ """
13
+
14
+ def __init__(self, indices):
15
+ super().__init__(indices)
16
+ self.indices = indices
17
+
18
+ def __iter__(self):
19
+ return (self.indices[i] for i in range(len(self.indices)))
20
+
21
+ def __len__(self):
22
+ return len(self.indices)
23
+
24
+
25
+ class PerfectBatchSampler(Sampler):
26
+ """
27
+ Samples a mini-batch of indices for a balanced class batching
28
+
29
+ Args:
30
+ dataset_items(list): dataset items to sample from.
31
+ classes (list): list of classes of dataset_items to sample from.
32
+ batch_size (int): total number of samples to be sampled in a mini-batch.
33
+ num_gpus (int): number of GPUs used in data parallel mode.
34
+ shuffle (bool): if True, samples randomly, otherwise samples sequentially.
35
+ drop_last (bool): if True, drops last incomplete batch.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ dataset_items,
41
+ classes,
42
+ batch_size,
43
+ num_classes_in_batch,
44
+ num_gpus=1,
45
+ shuffle=True,
46
+ drop_last=False,
47
+ label_key="class_name",
48
+ ):
49
+ super().__init__(dataset_items)
50
+ assert (
51
+ batch_size % (num_classes_in_batch * num_gpus) == 0
52
+ ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."
53
+
54
+ label_indices = {}
55
+ for idx, item in enumerate(dataset_items):
56
+ label = item[label_key]
57
+ if label not in label_indices.keys():
58
+ label_indices[label] = [idx]
59
+ else:
60
+ label_indices[label].append(idx)
61
+
62
+ if shuffle:
63
+ self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes]
64
+ else:
65
+ self._samplers = [SubsetSampler(label_indices[key]) for key in classes]
66
+
67
+ self._batch_size = batch_size
68
+ self._drop_last = drop_last
69
+ self._dp_devices = num_gpus
70
+ self._num_classes_in_batch = num_classes_in_batch
71
+
72
+ def __iter__(self):
73
+
74
+ batch = []
75
+ if self._num_classes_in_batch != len(self._samplers):
76
+ valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
77
+ else:
78
+ valid_samplers_idx = None
79
+
80
+ iters = [iter(s) for s in self._samplers]
81
+ done = False
82
+
83
+ while True:
84
+ b = []
85
+ for i, it in enumerate(iters):
86
+ if valid_samplers_idx is not None and i not in valid_samplers_idx:
87
+ continue
88
+ idx = next(it, None)
89
+ if idx is None:
90
+ done = True
91
+ break
92
+ b.append(idx)
93
+ if done:
94
+ break
95
+ batch += b
96
+ if len(batch) == self._batch_size:
97
+ yield batch
98
+ batch = []
99
+ if valid_samplers_idx is not None:
100
+ valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
101
+
102
+ if not self._drop_last:
103
+ if len(batch) > 0:
104
+ groups = len(batch) // self._num_classes_in_batch
105
+ if groups % self._dp_devices == 0:
106
+ yield batch
107
+ else:
108
+ batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
109
+ if len(batch) > 0:
110
+ yield batch
111
+
112
+ def __len__(self):
113
+ class_batch_size = self._batch_size // self._num_classes_in_batch
114
+ return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers)
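
Because the sampler only inspects the label key, it can be exercised on plain dictionaries. A toy run with two classes of unequal size (`shuffle=False` for determinism):

```python
from TTS.encoder.utils.samplers import PerfectBatchSampler

items = [
    {"class_name": "spk_a"}, {"class_name": "spk_a"}, {"class_name": "spk_a"},
    {"class_name": "spk_b"}, {"class_name": "spk_b"},
]
sampler = PerfectBatchSampler(
    items,
    classes=["spk_a", "spk_b"],
    batch_size=4,            # 2 classes x 2 utterances per class
    num_classes_in_batch=2,
    shuffle=False,
)
for batch in sampler:
    print(batch)  # [0, 3, 1, 4]: two indices per class; the unpaired leftover is never emitted
```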
TTS/encoder/utils/training.py ADDED
@@ -0,0 +1,99 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from coqpit import Coqpit
5
+ from trainer import TrainerArgs, get_last_checkpoint
6
+ from trainer.logging import logger_factory
7
+ from trainer.logging.console_logger import ConsoleLogger
8
+
9
+ from TTS.config import load_config, register_config
10
+ from TTS.tts.utils.text.characters import parse_symbols
11
+ from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
12
+ from TTS.utils.io import copy_model_files
13
+
14
+
15
+ @dataclass
16
+ class TrainArgs(TrainerArgs):
17
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
18
+
19
+
20
+ def getarguments():
21
+ train_config = TrainArgs()
22
+ parser = train_config.init_argparse(arg_prefix="")
23
+ return parser
24
+
25
+
26
+ def process_args(args, config=None):
27
+ """Process parsed command line arguments and initialize the config if not provided.
28
+ Args:
29
+ args (argparse.Namespace or dict like): Parsed input arguments.
30
+ config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
31
+ Returns:
32
+ c (TTS.utils.io.AttrDict): Config parameters.
33
+ out_path (str): Path to save models and logging.
34
+ audio_path (str): Path to save generated test audios.
35
+ c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
36
+ logging to the console.
37
+ dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
38
+ TODO:
39
+ - Interactive config definition.
40
+ """
41
+ if isinstance(args, tuple):
42
+ args, coqpit_overrides = args
43
+ if args.continue_path:
44
+ # continue a previous training from its output folder
45
+ experiment_path = args.continue_path
46
+ args.config_path = os.path.join(args.continue_path, "config.json")
47
+ args.restore_path, best_model = get_last_checkpoint(args.continue_path)
48
+ if not args.best_path:
49
+ args.best_path = best_model
50
+ # init config if not already defined
51
+ if config is None:
52
+ if args.config_path:
53
+ # init from a file
54
+ config = load_config(args.config_path)
55
+ else:
56
+ # init from console args
57
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
58
+
59
+ config_base = BaseTrainingConfig()
60
+ config_base.parse_known_args(coqpit_overrides)
61
+ config = register_config(config_base.model)()
62
+ # override values from command-line args
63
+ config.parse_known_args(coqpit_overrides, relaxed_parser=True)
64
+ experiment_path = args.continue_path
65
+ if not experiment_path:
66
+ experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
67
+ audio_path = os.path.join(experiment_path, "test_audios")
68
+ config.output_log_path = experiment_path
69
+ # setup rank 0 process in distributed training
70
+ dashboard_logger = None
71
+ if args.rank == 0:
72
+ new_fields = {}
73
+ if args.restore_path:
74
+ new_fields["restore_path"] = args.restore_path
75
+ new_fields["github_branch"] = get_git_branch()
76
+ # if model characters are not set in the config file
77
+ # save the default set to the config file for future
78
+ # compatibility.
79
+ if config.has("characters") and config.characters is None:
80
+ used_characters = parse_symbols()
81
+ new_fields["characters"] = used_characters
82
+ copy_model_files(config, experiment_path, new_fields)
83
+ dashboard_logger = logger_factory(config, experiment_path)
84
+ c_logger = ConsoleLogger()
85
+ return config, experiment_path, audio_path, c_logger, dashboard_logger
86
+
87
+
88
+ def init_arguments():
89
+ train_config = TrainArgs()
90
+ parser = train_config.init_argparse(arg_prefix="")
91
+ return parser
92
+
93
+
94
+ def init_training(config: Coqpit = None):
95
+ """Initialization of a training run."""
96
+ parser = init_arguments()
97
+ args = parser.parse_known_args()
98
+ config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
99
+ return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
TTS/encoder/utils/visual.py ADDED
@@ -0,0 +1,50 @@
1
+ import matplotlib
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import umap
5
+
6
+ matplotlib.use("Agg")
7
+
8
+
9
+ colormap = (
10
+ np.array(
11
+ [
12
+ [76, 255, 0],
13
+ [0, 127, 70],
14
+ [255, 0, 0],
15
+ [255, 217, 38],
16
+ [0, 135, 255],
17
+ [165, 0, 165],
18
+ [255, 167, 255],
19
+ [0, 255, 255],
20
+ [255, 96, 38],
21
+ [142, 76, 0],
22
+ [33, 0, 127],
23
+ [0, 0, 0],
24
+ [183, 183, 183],
25
+ ],
26
+ dtype=float,
27
+ )
28
+ / 255
29
+ )
30
+
31
+
32
+ def plot_embeddings(embeddings, num_classes_in_batch):
33
+ num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
34
+
35
+ # if necessary get just the first 10 classes
36
+ if num_classes_in_batch > 10:
37
+ num_classes_in_batch = 10
38
+ embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
39
+
40
+ model = umap.UMAP()
41
+ projection = model.fit_transform(embeddings)
42
+ ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
43
+ colors = [colormap[i] for i in ground_truth]
44
+ fig, ax = plt.subplots(figsize=(16, 10))
45
+ _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
46
+ plt.gca().set_aspect("equal", "datalim")
47
+ plt.title("UMAP projection")
48
+ plt.tight_layout()
49
+ plt.savefig("umap")
50
+ return fig
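
A hedged usage sketch for `plot_embeddings()`: embeddings are expected row-wise, grouped by class, with the same number of utterances per class. The shapes below are illustrative only, and `umap-learn` must be installed.

```python
# Usage sketch with synthetic data; shapes and values are illustrative only.
import numpy as np

from TTS.encoder.utils.visual import plot_embeddings

num_classes_in_batch, num_utter_per_class, embed_dim = 4, 5, 256
embeddings = np.random.randn(num_classes_in_batch * num_utter_per_class, embed_dim)

# Rows must be grouped by class: class 0's utterances first, then class 1's, etc.
fig = plot_embeddings(embeddings, num_classes_in_batch)  # also saves a "umap" figure to the working dir
```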
TTS/model.py ADDED
@@ -0,0 +1,56 @@
1
+ from abc import abstractmethod
2
+ from typing import Dict
3
+
4
+ import torch
5
+ from coqpit import Coqpit
6
+ from trainer import TrainerModel
7
+
8
+ # pylint: skip-file
9
+
10
+
11
+ class BaseTrainerModel(TrainerModel):
12
+ """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
13
+
14
+ Every new 🐸TTS model must inherit it.
15
+ """
16
+
17
+ @staticmethod
18
+ @abstractmethod
19
+ def init_from_config(config: Coqpit):
20
+ """Init the model and all its attributes from the given config.
21
+
22
+ Override this depending on your model.
23
+ """
24
+ ...
25
+
26
+ @abstractmethod
27
+ def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
28
+ """Forward pass for inference.
29
+
30
+ It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
31
+ is considered to be the main output and you can add any other auxiliary outputs as you want.
32
+
33
+ We don't use `*kwargs` since it is problematic with the TorchScript API.
34
+
35
+ Args:
36
+ input (torch.Tensor): Input tensor to run inference on.
37
+ aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
38
+
39
+ Returns:
40
+ Dict: Model outputs, with the main output under the ```model_outputs``` key.
41
+ """
42
+ outputs_dict = {"model_outputs": None}
43
+ ...
44
+ return outputs_dict
45
+
46
+ @abstractmethod
47
+ def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
48
+ """Load a model checkpoint gile and get ready for training or inference.
49
+
50
+ Args:
51
+ config (Coqpit): Model configuration.
52
+ checkpoint_path (str): Path to the model checkpoint file.
53
+ eval (bool, optional): If true, init model for inference else for training. Defaults to False.
54
+ strict (bool, optional): If True, match all checkpoint keys to the model's keys. Defaults to True.
55
+ """
56
+ ...
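
To make the contract above concrete, here is a hedged sketch of a toy subclass. It only fills in the three methods required by `BaseTrainerModel` itself; the `TrainerModel` base class from the `trainer` package imposes further requirements (e.g. `forward`, `train_step`) that are omitted, the layer sizes are placeholders, and the checkpoint layout is assumed.

```python
# Toy sketch of a BaseTrainerModel subclass; not a real 🐸TTS model.
from typing import Dict

import torch
from coqpit import Coqpit

from TTS.model import BaseTrainerModel


class ToyTTSModel(BaseTrainerModel):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(80, 80)  # placeholder layer

    @staticmethod
    def init_from_config(config: Coqpit):
        # A real model would read sizes, speaker settings, etc. from `config`.
        return ToyTTSModel()

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        # The main output must live under the `model_outputs` key.
        return {"model_outputs": self.layer(input)}

    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
        # Assumes the checkpoint stores weights under a "model" key.
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
        if eval:
            self.eval()
```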
TTS/server/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # :frog: TTS demo server
2
+ Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
3
+
4
+ **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` entry point on the terminal.
5
+
6
+ Example runs:
7
+
8
+ List officially released models.
9
+ ```python TTS/server/server.py --list_models ```
10
+
11
+ Run the server with the official models.
12
+ ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
13
+
14
+ Run the server with the official models on a GPU.
15
+ ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
16
+
17
+ Run the server with custom models.
18
+ ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
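
Once the server is running, speech can also be requested directly over HTTP. The `/api/tts` endpoint and its `text`, `speaker_id` and `style_wav` query parameters are defined in `TTS/server/server.py` below; assuming the default port 5002:

```curl "http://localhost:5002/api/tts?text=Hello%20world" --output hello.wav```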
TTS/server/__init__.py ADDED
File without changes
TTS/server/conf.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3
+ "tts_file":"best_model.pth", // tts checkpoint file
4
+ "tts_config":"config.json", // tts config.json file
5
+ "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6
+ "vocoder_config":null,
7
+ "vocoder_file": null,
8
+ "is_wavernn_batched":true,
9
+ "port": 5002,
10
+ "use_cuda": true,
11
+ "debug": true
12
+ }
TTS/server/server.py ADDED
@@ -0,0 +1,190 @@
1
+ #!flask/bin/python
2
+ import argparse
3
+ import io
4
+ import json
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Union
9
+
10
+ from flask import Flask, render_template, request, send_file
11
+
12
+ from TTS.config import load_config
13
+ from TTS.utils.manage import ModelManager
14
+ from TTS.utils.synthesizer import Synthesizer
15
+
16
+
17
+ def create_argparser():
18
+ def convert_boolean(x):
19
+ return x.lower() in ["true", "1", "yes"]
20
+
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument(
23
+ "--list_models",
24
+ type=convert_boolean,
25
+ nargs="?",
26
+ const=True,
27
+ default=False,
28
+ help="list available pre-trained tts and vocoder models.",
29
+ )
30
+ parser.add_argument(
31
+ "--model_name",
32
+ type=str,
33
+ default="tts_models/en/ljspeech/tacotron2-DDC",
34
+ help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
35
+ )
36
+ parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
37
+
38
+ # Args for running custom models
39
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
40
+ parser.add_argument(
41
+ "--model_path",
42
+ type=str,
43
+ default=None,
44
+ help="Path to model file.",
45
+ )
46
+ parser.add_argument(
47
+ "--vocoder_path",
48
+ type=str,
49
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
50
+ default=None,
51
+ )
52
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
53
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
54
+ parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
55
+ parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
56
+ parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
57
+ parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
58
+ return parser
59
+
60
+
61
+ # parse the args
62
+ args = create_argparser().parse_args()
63
+
64
+ path = Path(__file__).parent / "../.models.json"
65
+ manager = ModelManager(path)
66
+
67
+ if args.list_models:
68
+ manager.list_models()
69
+ sys.exit()
70
+
71
+ # update in-use models to the specified released models.
72
+ model_path = None
73
+ config_path = None
74
+ speakers_file_path = None
75
+ vocoder_path = None
76
+ vocoder_config_path = None
77
+
78
+ # CASE1: list pre-trained TTS models
79
+ if args.list_models:
80
+ manager.list_models()
81
+ sys.exit()
82
+
83
+ # CASE2: load pre-trained model paths
84
+ if args.model_name is not None and not args.model_path:
85
+ model_path, config_path, model_item = manager.download_model(args.model_name)
86
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
87
+
88
+ if args.vocoder_name is not None and not args.vocoder_path:
89
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
90
+
91
+ # CASE3: set custom model paths
92
+ if args.model_path is not None:
93
+ model_path = args.model_path
94
+ config_path = args.config_path
95
+ speakers_file_path = args.speakers_file_path
96
+
97
+ if args.vocoder_path is not None:
98
+ vocoder_path = args.vocoder_path
99
+ vocoder_config_path = args.vocoder_config_path
100
+
101
+ # load models
102
+ synthesizer = Synthesizer(
103
+ tts_checkpoint=model_path,
104
+ tts_config_path=config_path,
105
+ tts_speakers_file=speakers_file_path,
106
+ tts_languages_file=None,
107
+ vocoder_checkpoint=vocoder_path,
108
+ vocoder_config=vocoder_config_path,
109
+ encoder_checkpoint="",
110
+ encoder_config="",
111
+ use_cuda=args.use_cuda,
112
+ )
113
+
114
+ use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
115
+ synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
116
+ )
117
+
118
+ speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
119
+ # TODO: set this from SpeakerManager
120
+ use_gst = synthesizer.tts_config.get("use_gst", False)
121
+ app = Flask(__name__)
122
+
123
+
124
+ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
125
+ """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer)
126
+ or a dict (gst tokens/values to be use for styling)
127
+
128
+ Args:
129
+ style_wav (str): uri
130
+
131
+ Returns:
132
+ Union[str, dict]: path to file (str) or gst style (dict)
133
+ """
134
+ if style_wav:
135
+ if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
136
+ return style_wav # style_wav is a .wav file located on the server
137
+
138
+ style_wav = json.loads(style_wav)
139
+ return style_wav # style_wav is a gst dictionary with {token1_id : token1_weight, ...}
140
+ return None
141
+
142
+
143
+ @app.route("/")
144
+ def index():
145
+ return render_template(
146
+ "index.html",
147
+ show_details=args.show_details,
148
+ use_multi_speaker=use_multi_speaker,
149
+ speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
150
+ use_gst=use_gst,
151
+ )
152
+
153
+
154
+ @app.route("/details")
155
+ def details():
156
+ model_config = load_config(config_path)  # `args` defines no `tts_config`; use the resolved config path
157
+ if vocoder_config_path is not None and os.path.isfile(vocoder_config_path):
158
+ vocoder_config = load_config(vocoder_config_path)
159
+ else:
160
+ vocoder_config = None
161
+
162
+ return render_template(
163
+ "details.html",
164
+ show_details=args.show_details,
165
+ model_config=model_config,
166
+ vocoder_config=vocoder_config,
167
+ args=args.__dict__,
168
+ )
169
+
170
+
171
+ @app.route("/api/tts", methods=["GET"])
172
+ def tts():
173
+ text = request.args.get("text")
174
+ speaker_idx = request.args.get("speaker_id", "")
175
+ style_wav = request.args.get("style_wav", "")
176
+ style_wav = style_wav_uri_to_dict(style_wav)
177
+ print(" > Model input: {}".format(text))
178
+ print(" > Speaker Idx: {}".format(speaker_idx))
179
+ wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
180
+ out = io.BytesIO()
181
+ synthesizer.save_wav(wavs, out)
182
+ return send_file(out, mimetype="audio/wav")
183
+
184
+
185
+ def main():
186
+ app.run(debug=args.debug, host="::", port=args.port)
187
+
188
+
189
+ if __name__ == "__main__":
190
+ main()
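
A hedged client-side sketch for the `/api/tts` route above, assuming the server is running locally on the default port. The `style_wav` value is sent as JSON so that `style_wav_uri_to_dict()` parses it into a GST `{token_id: weight}` dict; a server-side `.wav` path would be passed through unchanged.

```python
# Client sketch; assumes a locally running server on the default port (5002).
import json

import requests

params = {
    "text": "Hello from the demo server.",
    "speaker_id": "",                     # only meaningful for multi-speaker models
    "style_wav": json.dumps({"0": 0.2}),  # parsed into a GST dict by style_wav_uri_to_dict()
}
response = requests.get("http://localhost:5002/api/tts", params=params)
response.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(response.content)  # the route returns audio/wav bytes
```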
TTS/server/static/coqui-log-green-TTS.png ADDED
TTS/server/templates/details.html ADDED
@@ -0,0 +1,131 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+
6
+ <meta charset="utf-8">
7
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
8
+ <meta name="description" content="">
9
+ <meta name="author" content="">
10
+
11
+ <title>TTS engine</title>
12
+
13
+ <!-- Bootstrap core CSS -->
14
+ <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
15
+ integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
16
+ rel="stylesheet">
17
+
18
+ <!-- Custom styles for this template -->
19
+ <style>
20
+ body {
21
+ padding-top: 54px;
22
+ }
23
+
24
+ @media (min-width: 992px) {
25
+ body {
26
+ padding-top: 56px;
27
+ }
28
+ }
29
+ </style>
30
+ </head>
31
+
32
+ <body>
33
+ <a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
34
+ src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
35
+
36
+ {% if show_details == true %}
37
+
38
+ <div class="container">
39
+ <b>Model details</b>
40
+ </div>
41
+
42
+ <div class="container">
43
+ <details>
44
+ <summary>CLI arguments:</summary>
45
+ <table border="1" align="center" width="75%">
46
+ <tr>
47
+ <td> CLI key </td>
48
+ <td> Value </td>
49
+ </tr>
50
+
51
+ {% for key, value in args.items() %}
52
+
53
+ <tr>
54
+ <td>{{ key }}</td>
55
+ <td>{{ value }}</td>
56
+ </tr>
57
+
58
+ {% endfor %}
59
+ </table>
60
+ </details>
61
+ </div></br>
62
+
63
+ <div class="container">
64
+
65
+ {% if model_config != None %}
66
+
67
+ <details>
68
+ <summary>Model config:</summary>
69
+
70
+ <table border="1" align="center" width="75%">
71
+ <tr>
72
+ <td> Key </td>
73
+ <td> Value </td>
74
+ </tr>
75
+
76
+
77
+ {% for key, value in model_config.items() %}
78
+
79
+ <tr>
80
+ <td>{{ key }}</td>
81
+ <td>{{ value }}</td>
82
+ </tr>
83
+
84
+ {% endfor %}
85
+
86
+ </table>
87
+ </details>
88
+
89
+ {% endif %}
90
+
91
+ </div></br>
92
+
93
+
94
+
95
+ <div class="container">
96
+ {% if vocoder_config != None %}
97
+ <details>
98
+ <summary>Vocoder model config:</summary>
99
+
100
+ <table border="1" align="center" width="75%">
101
+ <tr>
102
+ <td> Key </td>
103
+ <td> Value </td>
104
+ </tr>
105
+
106
+
107
+ {% for key, value in vocoder_config.items() %}
108
+
109
+ <tr>
110
+ <td>{{ key }}</td>
111
+ <td>{{ value }}</td>
112
+ </tr>
113
+
114
+ {% endfor %}
115
+
116
+
117
+ </table>
118
+ </details>
119
+ {% endif %}
120
+ </div></br>
121
+
122
+ {% else %}
123
+ <div class="container">
124
+ <b>Please start the server with --show_details=true to see the details.</b>
125
+ </div>
126
+
127
+ {% endif %}
128
+
129
+ </body>
130
+
131
+ </html>
TTS/server/templates/index.html ADDED
@@ -0,0 +1,143 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+
6
+ <meta charset="utf-8">
7
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
8
+ <meta name="description" content="🐸Coqui AI TTS demo server.">
9
+ <meta name="author" content="🐸Coqui AI TTS">
10
+
11
+ <title>TTS engine</title>
12
+
13
+ <!-- Bootstrap core CSS -->
14
+ <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
15
+ integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
16
+ rel="stylesheet">
17
+
18
+ <!-- Custom styles for this template -->
19
+ <style>
20
+ body {
21
+ padding-top: 54px;
22
+ }
23
+
24
+ @media (min-width: 992px) {
25
+ body {
26
+ padding-top: 56px;
27
+ }
28
+ }
29
+ </style>
30
+ </head>
31
+
32
+ <body>
33
+ <a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
34
+ src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
35
+
36
+ <!-- Navigation -->
37
+ <!--
38
+ <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
39
+ <div class="container">
40
+ <a class="navbar-brand" href="#">Coqui TTS</a>
41
+ <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
42
+ <span class="navbar-toggler-icon"></span>
43
+ </button>
44
+ <div class="collapse navbar-collapse" id="navbarResponsive">
45
+ <ul class="navbar-nav ml-auto">
46
+ <li class="nav-item active">
47
+ <a class="nav-link" href="#">Home
48
+ <span class="sr-only">(current)</span>
49
+ </a>
50
+ </li>
51
+ </ul>
52
+ </div>
53
+ </div>
54
+ </nav>
55
+ -->
56
+
57
+ <!-- Page Content -->
58
+ <div class="container">
59
+ <div class="row">
60
+ <div class="col-lg-12 text-center">
61
+ <img class="mt-5" src="{{url_for('static', filename='coqui-log-green-TTS.png')}}" align="middle"
62
+ width="512" />
63
+
64
+ <ul class="list-unstyled">
65
+ </ul>
66
+
67
+ {%if use_gst%}
68
+ <input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path to wav).." size=45
69
+ type="text" name="style_wav">
70
+ {%endif%}
71
+
72
+ <input id="text" placeholder="Type here..." size=45 type="text" name="text">
73
+ <button id="speak-button" name="speak">Speak</button><br /><br />
74
+
75
+ {%if use_multi_speaker%}
76
+ Choose a speaker:
77
+ <select id="speaker_id" name=speaker_id method="GET" action="/">
78
+ {% for speaker_id in speaker_ids %}
79
+ <option value="{{speaker_id}}" SELECTED>{{speaker_id}}</option>"
80
+ {% endfor %}
81
+ </select><br /><br />
82
+ {%endif%}
83
+
84
+ {%if show_details%}
85
+ <button id="details-button" onclick="location.href = 'details'" name="model-details">Model
86
+ Details</button><br /><br />
87
+ {%endif%}
88
+ <audio id="audio" controls autoplay hidden></audio>
89
+ <p id="message"></p>
90
+ </div>
91
+ </div>
92
+ </div>
93
+
94
+ <!-- Bootstrap core JavaScript -->
95
+ <script>
96
+ function getTextValue(textId) {
97
+ const container = q(textId)
98
+ if (container) {
99
+ return container.value
100
+ }
101
+ return ""
102
+ }
103
+ function q(selector) { return document.querySelector(selector) }
104
+ q('#text').focus()
105
+ function do_tts(e) {
106
+ const text = q('#text').value
107
+ const speaker_id = getTextValue('#speaker_id')
108
+ const style_wav = getTextValue('#style_wav')
109
+ if (text) {
110
+ q('#message').textContent = 'Synthesizing...'
111
+ q('#speak-button').disabled = true
112
+ q('#audio').hidden = true
113
+ synthesize(text, speaker_id, style_wav)
114
+ }
115
+ e.preventDefault()
116
+ return false
117
+ }
118
+ q('#speak-button').addEventListener('click', do_tts)
119
+ q('#text').addEventListener('keyup', function (e) {
120
+ if (e.keyCode == 13) { // enter
121
+ do_tts(e)
122
+ }
123
+ })
124
+ function synthesize(text, speaker_id = "", style_wav = "") {
125
+ fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' })
126
+ .then(function (res) {
127
+ if (!res.ok) throw Error(res.statusText)
128
+ return res.blob()
129
+ }).then(function (blob) {
130
+ q('#message').textContent = ''
131
+ q('#speak-button').disabled = false
132
+ q('#audio').src = URL.createObjectURL(blob)
133
+ q('#audio').hidden = false
134
+ }).catch(function (err) {
135
+ q('#message').textContent = 'Error: ' + err.message
136
+ q('#speak-button').disabled = false
137
+ })
138
+ }
139
+ </script>
140
+
141
+ </body>
142
+
143
+ </html>
TTS/tts/__init__.py ADDED
File without changes
TTS/tts/configs/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ import importlib
2
+ import os
3
+ from inspect import isclass
4
+
5
+ # import all files under configs/
6
+ # configs_dir = os.path.dirname(__file__)
7
+ # for file in os.listdir(configs_dir):
8
+ # path = os.path.join(configs_dir, file)
9
+ # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10
+ # config_name = file[: file.find(".py")] if file.endswith(".py") else file
11
+ # module = importlib.import_module("TTS.tts.configs." + config_name)
12
+ # for attribute_name in dir(module):
13
+ # attribute = getattr(module, attribute_name)
14
+
15
+ # if isclass(attribute):
16
+ # # Add the class to this package's variables
17
+ # globals()[attribute_name] = attribute
TTS/tts/configs/align_tts_config.py ADDED
@@ -0,0 +1,107 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+ from TTS.tts.configs.shared_configs import BaseTTSConfig
5
+ from TTS.tts.models.align_tts import AlignTTSArgs
6
+
7
+
8
+ @dataclass
9
+ class AlignTTSConfig(BaseTTSConfig):
10
+ """Defines parameters for AlignTTS model.
11
+ Example:
12
+
13
+ >>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
14
+ >>> config = AlignTTSConfig()
15
+
16
+ Args:
17
+ model(str):
18
+ Model name used for selecting the right model at initialization. Defaults to `align_tts`.
19
+ positional_encoding (bool):
20
+ enable / disable positional encoding applied to the encoder output. Defaults to True.
21
+ hidden_channels (int):
22
+ Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
23
+ parameters. Defaults to 256.
24
+ hidden_channels_dp (int):
25
+ Number of hidden channels of the duration predictor's layers. Defaults to 256.
26
+ encoder_type (str):
27
+ Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
28
+ Defaults to `fftransformer`.
29
+ encoder_params (dict):
30
+ Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
31
+ Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
32
+ decoder_type (str):
33
+ Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
34
+ Defaults to `fftransformer`.
35
+ decoder_params (dict):
36
+ Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
37
+ Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
38
+ phase_start_steps (List[int]):
39
+ A list of number of steps required to start the next training phase. AlignTTS has 4 different training
40
+ phases. Thus you need to define 4 different values to enable phase based training. If None, it
41
+ trains the whole model together. Defaults to None.
42
+ ssim_alpha (float):
43
+ Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
44
+ dur_loss_alpha (float):
45
+ Weight for the duration predictor's loss. Defaults to 1.0.
46
+ mdn_alpha (float):
47
+ Weight for the MDN loss. Defaults to 1.0.
48
+ spec_loss_alpha (float):
49
+ Weight for the MSE spectrogram loss. If set <= 0, disables the spectrogram loss. Defaults to 1.0.
50
+ use_speaker_embedding (bool):
51
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
52
+ in the multi-speaker mode. Defaults to False.
53
+ use_d_vector_file (bool):
54
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
55
+ d_vector_file (str):
56
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
57
+ noam_schedule (bool):
58
+ enable / disable the use of Noam LR scheduler. Defaults to False.
59
+ warmup_steps (int):
60
+ Number of warm-up steps for the Noam scheduler. Defaults to 4000.
61
+ lr (float):
62
+ Initial learning rate. Defaults to `1e-4`.
63
+ wd (float):
64
+ Weight decay coefficient. Defaults to `1e-7`.
65
+ min_seq_len (int):
66
+ Minimum input sequence length to be used at training.
67
+ max_seq_len (int):
68
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
69
+
70
+ model: str = "align_tts"
71
+ # model specific params
72
+ model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
73
+ phase_start_steps: List[int] = None
74
+
75
+ ssim_alpha: float = 1.0
76
+ spec_loss_alpha: float = 1.0
77
+ dur_loss_alpha: float = 1.0
78
+ mdn_alpha: float = 1.0
79
+
80
+ # multi-speaker settings
81
+ use_speaker_embedding: bool = False
82
+ use_d_vector_file: bool = False
83
+ d_vector_file: str = False
84
+
85
+ # optimizer parameters
86
+ optimizer: str = "Adam"
87
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
88
+ lr_scheduler: str = None
89
+ lr_scheduler_params: dict = None
90
+ lr: float = 1e-4
91
+ grad_clip: float = 5.0
92
+
93
+ # overrides
94
+ min_seq_len: int = 13
95
+ max_seq_len: int = 200
96
+ r: int = 1
97
+
98
+ # testing
99
+ test_sentences: List[str] = field(
100
+ default_factory=lambda: [
101
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
102
+ "Be a voice, not an echo.",
103
+ "I'm sorry Dave. I'm afraid I can't do that.",
104
+ "This cake is great. It's so delicious and moist.",
105
+ "Prior to November 22, 1963.",
106
+ ]
107
+ )
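
A hedged configuration sketch for the phase-based training described in the docstring. The four `phase_start_steps` values are illustrative placeholders; the docstring only requires that four values be given when phase-based training is enabled.

```python
# Configuration sketch; the step values below are illustrative placeholders.
from TTS.tts.configs.align_tts_config import AlignTTSConfig

config = AlignTTSConfig(
    phase_start_steps=[0, 40000, 80000, 160000],  # one start step per AlignTTS training phase
    ssim_alpha=1.0,
    lr=1e-4,
)
print(config.phase_start_steps)
```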
TTS/tts/configs/fast_pitch_config.py ADDED
@@ -0,0 +1,182 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+ from TTS.tts.configs.shared_configs import BaseTTSConfig
5
+ from TTS.tts.models.forward_tts import ForwardTTSArgs
6
+
7
+
8
+ @dataclass
9
+ class FastPitchConfig(BaseTTSConfig):
10
+ """Configure `ForwardTTS` as FastPitch model.
11
+
12
+ Example:
13
+
14
+ >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
15
+ >>> config = FastPitchConfig()
16
+
17
+ Args:
18
+ model (str):
19
+ Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
20
+
21
+ base_model (str):
22
+ Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
23
+ the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
24
+
25
+ model_args (Coqpit):
26
+ Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
27
+
28
+ data_dep_init_steps (int):
29
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
30
+ Activation Normalization that pre-computes normalization stats at the beginning and use the same values
31
+ for the rest. Defaults to 10.
32
+
33
+ speakers_file (str):
34
+ Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
35
+ speaker names. Defaults to `None`.
36
+
37
+ use_speaker_embedding (bool):
38
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
39
+ in the multi-speaker mode. Defaults to False.
40
+
41
+ use_d_vector_file (bool):
42
+ enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
43
+
44
+ d_vector_file (str):
45
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
46
+
47
+ d_vector_dim (int):
48
+ Dimension of the external speaker embeddings. Defaults to 0.
49
+
50
+ optimizer (str):
51
+ Name of the model optimizer. Defaults to `Adam`.
52
+
53
+ optimizer_params (dict):
54
+ Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
55
+
56
+ lr_scheduler (str):
57
+ Name of the learning rate scheduler. Defaults to `Noam`.
58
+
59
+ lr_scheduler_params (dict):
60
+ Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
61
+
62
+ lr (float):
63
+ Initial learning rate. Defaults to `1e-3`.
64
+
65
+ grad_clip (float):
66
+ Gradient norm clipping value. Defaults to `5.0`.
67
+
68
+ spec_loss_type (str):
69
+ Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
70
+
71
+ duration_loss_type (str):
72
+ Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
73
+
74
+ use_ssim_loss (bool):
75
+ Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
76
+
77
+ wd (float):
78
+ Weight decay coefficient. Defaults to `1e-7`.
79
+
80
+ ssim_loss_alpha (float):
81
+ Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
82
+
83
+ dur_loss_alpha (float):
84
+ Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 0.1.
85
+
86
+ spec_loss_alpha (float):
87
+ Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
88
+
89
+ pitch_loss_alpha (float):
90
+ Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.
91
+
92
+ binary_align_loss_alpha (float):
93
+ Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.
94
+
95
+ binary_loss_warmup_epochs (int):
96
+ Number of epochs to gradually increase the binary loss impact. Defaults to 150.
97
+
98
+ min_seq_len (int):
99
+ Minimum input sequence length to be used at training.
100
+
101
+ max_seq_len (int):
102
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
103
+ """
104
+
105
+ model: str = "fast_pitch"
106
+ base_model: str = "forward_tts"
107
+
108
+ # model specific params
109
+ model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
110
+
111
+ # data loader params
112
+ return_wav: bool = False
113
+ compute_linear_spec: bool = False
114
+
115
+ # multi-speaker settings
116
+ num_speakers: int = 0
117
+ speakers_file: str = None
118
+ use_speaker_embedding: bool = False
119
+ use_d_vector_file: bool = False
120
+ d_vector_file: str = False
121
+ d_vector_dim: int = 0
122
+
123
+ # optimizer parameters
124
+ optimizer: str = "Adam"
125
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
126
+ lr_scheduler: str = "NoamLR"
127
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
128
+ lr: float = 1e-4
129
+ grad_clip: float = 5.0
130
+
131
+ # loss params
132
+ spec_loss_type: str = "mse"
133
+ duration_loss_type: str = "mse"
134
+ use_ssim_loss: bool = True
135
+ ssim_loss_alpha: float = 1.0
136
+ spec_loss_alpha: float = 1.0
137
+ aligner_loss_alpha: float = 1.0
138
+ pitch_loss_alpha: float = 0.1
139
+ dur_loss_alpha: float = 0.1
140
+ binary_align_loss_alpha: float = 0.1
141
+ spk_encoder_loss_alpha: float = 0.1
142
+ binary_loss_warmup_epochs: int = 150
143
+ aligner_epochs: int = 1000
144
+
145
+ # overrides
146
+ min_seq_len: int = 13
147
+ max_seq_len: int = 200
148
+ r: int = 1 # DO NOT CHANGE
149
+
150
+ # dataset configs
151
+ compute_f0: bool = True
152
+ f0_cache_path: str = None
153
+
154
+ # testing
155
+ test_sentences: List[str] = field(
156
+ default_factory=lambda: [
157
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
158
+ "Be a voice, not an echo.",
159
+ "I'm sorry Dave. I'm afraid I can't do that.",
160
+ "This cake is great. It's so delicious and moist.",
161
+ "Prior to November 22, 1963.",
162
+ ]
163
+ )
164
+
165
+ def __post_init__(self):
166
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
167
+ if self.num_speakers > 0:
168
+ self.model_args.num_speakers = self.num_speakers
169
+
170
+ # speaker embedding settings
171
+ if self.use_speaker_embedding:
172
+ self.model_args.use_speaker_embedding = True
173
+ if self.speakers_file:
174
+ self.model_args.speakers_file = self.speakers_file
175
+
176
+ # d-vector settings
177
+ if self.use_d_vector_file:
178
+ self.model_args.use_d_vector_file = True
179
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
180
+ self.model_args.d_vector_dim = self.d_vector_dim
181
+ if self.d_vector_file:
182
+ self.model_args.d_vector_file = self.d_vector_file
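
A hedged sketch of the `__post_init__` behaviour above: multi-speaker settings given at the config level are mirrored into `model_args`, where `model.init_multispeaker()` expects them. The values are illustrative.

```python
# Sketch of the multi-speaker propagation done in __post_init__; values are illustrative.
from TTS.tts.configs.fast_pitch_config import FastPitchConfig

config = FastPitchConfig(num_speakers=4, use_speaker_embedding=True)
assert config.model_args.num_speakers == 4
assert config.model_args.use_speaker_embedding is True
```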