Init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- TTS/.models.json +500 -0
- TTS/__init__.py +6 -0
- TTS/bin/__init__.py +0 -0
- TTS/bin/collect_env_info.py +48 -0
- TTS/bin/compute_attention_masks.py +165 -0
- TTS/bin/compute_embeddings.py +84 -0
- TTS/bin/compute_statistics.py +96 -0
- TTS/bin/eval_encoder.py +89 -0
- TTS/bin/extract_tts_spectrograms.py +287 -0
- TTS/bin/find_unique_chars.py +45 -0
- TTS/bin/find_unique_phonemes.py +70 -0
- TTS/bin/remove_silence_using_vad.py +85 -0
- TTS/bin/resample.py +87 -0
- TTS/bin/synthesize.py +425 -0
- TTS/bin/train_encoder.py +319 -0
- TTS/bin/train_tts.py +71 -0
- TTS/bin/train_vocoder.py +77 -0
- TTS/bin/tune_wavegrad.py +100 -0
- TTS/config/__init__.py +132 -0
- TTS/config/shared_configs.py +260 -0
- TTS/encoder/README.md +18 -0
- TTS/encoder/__init__.py +0 -0
- TTS/encoder/configs/base_encoder_config.py +61 -0
- TTS/encoder/configs/emotion_encoder_config.py +12 -0
- TTS/encoder/configs/speaker_encoder_config.py +11 -0
- TTS/encoder/dataset.py +147 -0
- TTS/encoder/losses.py +226 -0
- TTS/encoder/models/base_encoder.py +154 -0
- TTS/encoder/models/lstm.py +99 -0
- TTS/encoder/models/resnet.py +200 -0
- TTS/encoder/requirements.txt +2 -0
- TTS/encoder/utils/__init__.py +0 -0
- TTS/encoder/utils/generic_utils.py +184 -0
- TTS/encoder/utils/io.py +38 -0
- TTS/encoder/utils/prepare_voxceleb.py +219 -0
- TTS/encoder/utils/samplers.py +114 -0
- TTS/encoder/utils/training.py +99 -0
- TTS/encoder/utils/visual.py +50 -0
- TTS/model.py +56 -0
- TTS/server/README.md +18 -0
- TTS/server/__init__.py +0 -0
- TTS/server/conf.json +12 -0
- TTS/server/server.py +190 -0
- TTS/server/static/coqui-log-green-TTS.png +0 -0
- TTS/server/templates/details.html +131 -0
- TTS/server/templates/index.html +143 -0
- TTS/tts/__init__.py +0 -0
- TTS/tts/configs/__init__.py +17 -0
- TTS/tts/configs/align_tts_config.py +107 -0
- TTS/tts/configs/fast_pitch_config.py +182 -0
TTS/.models.json
ADDED
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tts_models": {
|
3 |
+
"multilingual":{
|
4 |
+
"multi-dataset":{
|
5 |
+
"your_tts":{
|
6 |
+
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
7 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
8 |
+
"default_vocoder": null,
|
9 |
+
"commit": "e9a1953e",
|
10 |
+
"license": "CC BY-NC-ND 4.0",
|
11 |
+
"contact": "[email protected]"
|
12 |
+
}
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"en": {
|
16 |
+
"ek1": {
|
17 |
+
"tacotron2": {
|
18 |
+
"description": "EK1 en-rp tacotron2 by NMStoker",
|
19 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
|
20 |
+
"default_vocoder": "vocoder_models/en/ek1/wavegrad",
|
21 |
+
"commit": "c802255",
|
22 |
+
"license": "apache 2.0"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"ljspeech": {
|
26 |
+
"tacotron2-DDC": {
|
27 |
+
"description": "Tacotron2 with Double Decoder Consistency.",
|
28 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
|
29 |
+
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
30 |
+
"commit": "bae2ad0f",
|
31 |
+
"author": "Eren Gölge @erogol",
|
32 |
+
"license": "apache 2.0",
|
33 |
+
"contact": "[email protected]"
|
34 |
+
},
|
35 |
+
"tacotron2-DDC_ph": {
|
36 |
+
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
|
37 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
|
38 |
+
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
|
39 |
+
"commit": "3900448",
|
40 |
+
"author": "Eren Gölge @erogol",
|
41 |
+
"license": "apache 2.0",
|
42 |
+
"contact": "[email protected]"
|
43 |
+
},
|
44 |
+
"glow-tts": {
|
45 |
+
"description": "",
|
46 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
|
47 |
+
"stats_file": null,
|
48 |
+
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
49 |
+
"commit": "",
|
50 |
+
"author": "Eren Gölge @erogol",
|
51 |
+
"license": "MPL",
|
52 |
+
"contact": "[email protected]"
|
53 |
+
},
|
54 |
+
"speedy-speech": {
|
55 |
+
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
|
56 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
|
57 |
+
"stats_file": null,
|
58 |
+
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
59 |
+
"commit": "4581e3d",
|
60 |
+
"author": "Eren Gölge @erogol",
|
61 |
+
"license": "apache 2.0",
|
62 |
+
"contact": "[email protected]"
|
63 |
+
},
|
64 |
+
"tacotron2-DCA": {
|
65 |
+
"description": "",
|
66 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
|
67 |
+
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
68 |
+
"commit": "",
|
69 |
+
"author": "Eren Gölge @erogol",
|
70 |
+
"license": "MPL",
|
71 |
+
"contact": "[email protected]"
|
72 |
+
},
|
73 |
+
"vits": {
|
74 |
+
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
|
75 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
|
76 |
+
"default_vocoder": null,
|
77 |
+
"commit": "3900448",
|
78 |
+
"author": "Eren Gölge @erogol",
|
79 |
+
"license": "apache 2.0",
|
80 |
+
"contact": "[email protected]"
|
81 |
+
},
|
82 |
+
"fast_pitch": {
|
83 |
+
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
|
84 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
|
85 |
+
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
86 |
+
"commit": "b27b3ba",
|
87 |
+
"author": "Eren Gölge @erogol",
|
88 |
+
"license": "apache 2.0",
|
89 |
+
"contact": "[email protected]"
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"vctk": {
|
93 |
+
"vits": {
|
94 |
+
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
|
95 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
|
96 |
+
"default_vocoder": null,
|
97 |
+
"commit": "3900448",
|
98 |
+
"author": "Eren @erogol",
|
99 |
+
"license": "apache 2.0",
|
100 |
+
"contact": "[email protected]"
|
101 |
+
},
|
102 |
+
"fast_pitch":{
|
103 |
+
"description": "FastPitch model trained on VCTK dataseset.",
|
104 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
105 |
+
"default_vocoder": null,
|
106 |
+
"commit": "bdab788d",
|
107 |
+
"author": "Eren @erogol",
|
108 |
+
"license": "CC BY-NC-ND 4.0",
|
109 |
+
"contact": "[email protected]"
|
110 |
+
}
|
111 |
+
},
|
112 |
+
"sam": {
|
113 |
+
"tacotron-DDC": {
|
114 |
+
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
|
115 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
|
116 |
+
"default_vocoder": "vocoder_models/en/sam/hifigan_v2",
|
117 |
+
"commit": "bae2ad0f",
|
118 |
+
"author": "Eren Gölge @erogol",
|
119 |
+
"license": "apache 2.0",
|
120 |
+
"contact": "[email protected]"
|
121 |
+
}
|
122 |
+
},
|
123 |
+
"blizzard2013": {
|
124 |
+
"capacitron-t2-c50": {
|
125 |
+
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
|
126 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
|
127 |
+
"commit": "d6284e7",
|
128 |
+
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
129 |
+
"author": "Adam Froghyar @a-froghyar",
|
130 |
+
"license": "apache 2.0",
|
131 |
+
"contact": "[email protected]"
|
132 |
+
},
|
133 |
+
"capacitron-t2-c150": {
|
134 |
+
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
|
135 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
|
136 |
+
"commit": "d6284e7",
|
137 |
+
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
138 |
+
"author": "Adam Froghyar @a-froghyar",
|
139 |
+
"license": "apache 2.0",
|
140 |
+
"contact": "[email protected]"
|
141 |
+
}
|
142 |
+
}
|
143 |
+
},
|
144 |
+
"es": {
|
145 |
+
"mai": {
|
146 |
+
"tacotron2-DDC": {
|
147 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
|
148 |
+
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
149 |
+
"commit": "",
|
150 |
+
"author": "Eren Gölge @erogol",
|
151 |
+
"license": "MPL",
|
152 |
+
"contact": "[email protected]"
|
153 |
+
}
|
154 |
+
}
|
155 |
+
},
|
156 |
+
"fr": {
|
157 |
+
"mai": {
|
158 |
+
"tacotron2-DDC": {
|
159 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
|
160 |
+
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
161 |
+
"commit": "",
|
162 |
+
"author": "Eren Gölge @erogol",
|
163 |
+
"license": "MPL",
|
164 |
+
"contact": "[email protected]"
|
165 |
+
}
|
166 |
+
}
|
167 |
+
},
|
168 |
+
"uk":{
|
169 |
+
"mai": {
|
170 |
+
"glow-tts": {
|
171 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
172 |
+
"author":"@robinhad",
|
173 |
+
"commit": "bdab788d",
|
174 |
+
"license": "MIT",
|
175 |
+
"contact": "",
|
176 |
+
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
177 |
+
}
|
178 |
+
}
|
179 |
+
},
|
180 |
+
"zh-CN": {
|
181 |
+
"baker": {
|
182 |
+
"tacotron2-DDC-GST": {
|
183 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
|
184 |
+
"commit": "unknown",
|
185 |
+
"author": "@kirianguiller",
|
186 |
+
"license": "apache 2.0",
|
187 |
+
"default_vocoder": null
|
188 |
+
}
|
189 |
+
}
|
190 |
+
},
|
191 |
+
"nl": {
|
192 |
+
"mai": {
|
193 |
+
"tacotron2-DDC": {
|
194 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
|
195 |
+
"author": "@r-dh",
|
196 |
+
"license": "apache 2.0",
|
197 |
+
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
|
198 |
+
"stats_file": null,
|
199 |
+
"commit": "540d811"
|
200 |
+
}
|
201 |
+
}
|
202 |
+
},
|
203 |
+
"de": {
|
204 |
+
"thorsten": {
|
205 |
+
"tacotron2-DCA": {
|
206 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
|
207 |
+
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
|
208 |
+
"author": "@thorstenMueller",
|
209 |
+
"license": "apache 2.0",
|
210 |
+
"commit": "unknown"
|
211 |
+
},
|
212 |
+
"vits": {
|
213 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
|
214 |
+
"default_vocoder": null,
|
215 |
+
"author": "@thorstenMueller",
|
216 |
+
"license": "apache 2.0",
|
217 |
+
"commit": "unknown"
|
218 |
+
}
|
219 |
+
}
|
220 |
+
},
|
221 |
+
"ja": {
|
222 |
+
"kokoro": {
|
223 |
+
"tacotron2-DDC": {
|
224 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
|
225 |
+
"default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
|
226 |
+
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
|
227 |
+
"author": "@kaiidams",
|
228 |
+
"license": "apache 2.0",
|
229 |
+
"commit": "401fbd89"
|
230 |
+
}
|
231 |
+
}
|
232 |
+
},
|
233 |
+
"tr":{
|
234 |
+
"common-voice": {
|
235 |
+
"glow-tts":{
|
236 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
237 |
+
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
238 |
+
"license": "MIT",
|
239 |
+
"description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
|
240 |
+
"author": "Fatih Akademi",
|
241 |
+
"commit": null
|
242 |
+
}
|
243 |
+
}
|
244 |
+
},
|
245 |
+
"it": {
|
246 |
+
"mai_female": {
|
247 |
+
"glow-tts":{
|
248 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
249 |
+
"default_vocoder": null,
|
250 |
+
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
251 |
+
"author": "@nicolalandro",
|
252 |
+
"license": "apache 2.0",
|
253 |
+
"commit": null
|
254 |
+
},
|
255 |
+
"vits":{
|
256 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
257 |
+
"default_vocoder": null,
|
258 |
+
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
259 |
+
"author": "@nicolalandro",
|
260 |
+
"license": "apache 2.0",
|
261 |
+
"commit": null
|
262 |
+
}
|
263 |
+
},
|
264 |
+
"mai_male": {
|
265 |
+
"glow-tts":{
|
266 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
267 |
+
"default_vocoder": null,
|
268 |
+
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
269 |
+
"author": "@nicolalandro",
|
270 |
+
"license": "apache 2.0",
|
271 |
+
"commit": null
|
272 |
+
},
|
273 |
+
"vits":{
|
274 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
275 |
+
"default_vocoder": null,
|
276 |
+
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
277 |
+
"author": "@nicolalandro",
|
278 |
+
"license": "apache 2.0",
|
279 |
+
"commit": null
|
280 |
+
}
|
281 |
+
}
|
282 |
+
},
|
283 |
+
"ewe": {
|
284 |
+
"openbible": {
|
285 |
+
"vits":{
|
286 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
287 |
+
"default_vocoder": null,
|
288 |
+
"license": "CC-BY-SA 4.0",
|
289 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
290 |
+
"author": "@coqui_ai",
|
291 |
+
"commit": "1b22f03"
|
292 |
+
}
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"hau": {
|
296 |
+
"openbible": {
|
297 |
+
"vits":{
|
298 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
299 |
+
"default_vocoder": null,
|
300 |
+
"license": "CC-BY-SA 4.0",
|
301 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
302 |
+
"author": "@coqui_ai",
|
303 |
+
"commit": "1b22f03"
|
304 |
+
}
|
305 |
+
}
|
306 |
+
},
|
307 |
+
"lin": {
|
308 |
+
"openbible": {
|
309 |
+
"vits":{
|
310 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
311 |
+
"default_vocoder": null,
|
312 |
+
"license": "CC-BY-SA 4.0",
|
313 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
314 |
+
"author": "@coqui_ai",
|
315 |
+
"commit": "1b22f03"
|
316 |
+
}
|
317 |
+
}
|
318 |
+
},
|
319 |
+
"tw_akuapem": {
|
320 |
+
"openbible": {
|
321 |
+
"vits":{
|
322 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
323 |
+
"default_vocoder": null,
|
324 |
+
"license": "CC-BY-SA 4.0",
|
325 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
326 |
+
"author": "@coqui_ai",
|
327 |
+
"commit": "1b22f03"
|
328 |
+
}
|
329 |
+
}
|
330 |
+
},
|
331 |
+
"tw_asante": {
|
332 |
+
"openbible": {
|
333 |
+
"vits":{
|
334 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
335 |
+
"default_vocoder": null,
|
336 |
+
"license": "CC-BY-SA 4.0",
|
337 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
338 |
+
"author": "@coqui_ai",
|
339 |
+
"commit": "1b22f03"
|
340 |
+
}
|
341 |
+
}
|
342 |
+
},
|
343 |
+
"yor": {
|
344 |
+
"openbible": {
|
345 |
+
"vits":{
|
346 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
347 |
+
"default_vocoder": null,
|
348 |
+
"license": "CC-BY-SA 4.0",
|
349 |
+
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
350 |
+
"author": "@coqui_ai",
|
351 |
+
"commit": "1b22f03"
|
352 |
+
}
|
353 |
+
}
|
354 |
+
}
|
355 |
+
},
|
356 |
+
"vocoder_models": {
|
357 |
+
"universal": {
|
358 |
+
"libri-tts": {
|
359 |
+
"wavegrad": {
|
360 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
|
361 |
+
"commit": "ea976b0",
|
362 |
+
"author": "Eren Gölge @erogol",
|
363 |
+
"license": "MPL",
|
364 |
+
"contact": "[email protected]"
|
365 |
+
},
|
366 |
+
"fullband-melgan": {
|
367 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
|
368 |
+
"commit": "4132240",
|
369 |
+
"author": "Eren Gölge @erogol",
|
370 |
+
"license": "MPL",
|
371 |
+
"contact": "[email protected]"
|
372 |
+
}
|
373 |
+
}
|
374 |
+
},
|
375 |
+
"en": {
|
376 |
+
"ek1": {
|
377 |
+
"wavegrad": {
|
378 |
+
"description": "EK1 en-rp wavegrad by NMStoker",
|
379 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
|
380 |
+
"commit": "c802255",
|
381 |
+
"license": "apache 2.0"
|
382 |
+
}
|
383 |
+
},
|
384 |
+
"ljspeech": {
|
385 |
+
"multiband-melgan": {
|
386 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
|
387 |
+
"commit": "ea976b0",
|
388 |
+
"author": "Eren Gölge @erogol",
|
389 |
+
"license": "MPL",
|
390 |
+
"contact": "[email protected]"
|
391 |
+
},
|
392 |
+
"hifigan_v2": {
|
393 |
+
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
394 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
|
395 |
+
"commit": "bae2ad0f",
|
396 |
+
"author": "@erogol",
|
397 |
+
"license": "apache 2.0",
|
398 |
+
"contact": "[email protected]"
|
399 |
+
},
|
400 |
+
"univnet": {
|
401 |
+
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
|
402 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
|
403 |
+
"commit": "4581e3d",
|
404 |
+
"author": "Eren @erogol",
|
405 |
+
"license": "apache 2.0",
|
406 |
+
"contact": "[email protected]"
|
407 |
+
}
|
408 |
+
},
|
409 |
+
"blizzard2013": {
|
410 |
+
"hifigan_v2": {
|
411 |
+
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
412 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
|
413 |
+
"commit": "d6284e7",
|
414 |
+
"author": "Adam Froghyar @a-froghyar",
|
415 |
+
"license": "apache 2.0",
|
416 |
+
"contact": "[email protected]"
|
417 |
+
}
|
418 |
+
},
|
419 |
+
"vctk": {
|
420 |
+
"hifigan_v2": {
|
421 |
+
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
|
422 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
|
423 |
+
"commit": "2f07160",
|
424 |
+
"author": "Edresson Casanova",
|
425 |
+
"license": "apache 2.0",
|
426 |
+
"contact": ""
|
427 |
+
}
|
428 |
+
},
|
429 |
+
"sam": {
|
430 |
+
"hifigan_v2": {
|
431 |
+
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
|
432 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
|
433 |
+
"commit": "2f07160",
|
434 |
+
"author": "Eren Gölge @erogol",
|
435 |
+
"license": "apache 2.0",
|
436 |
+
"contact": "[email protected]"
|
437 |
+
}
|
438 |
+
}
|
439 |
+
},
|
440 |
+
"nl": {
|
441 |
+
"mai": {
|
442 |
+
"parallel-wavegan": {
|
443 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
|
444 |
+
"author": "@r-dh",
|
445 |
+
"license": "apache 2.0",
|
446 |
+
"commit": "unknown"
|
447 |
+
}
|
448 |
+
}
|
449 |
+
},
|
450 |
+
"de": {
|
451 |
+
"thorsten": {
|
452 |
+
"wavegrad": {
|
453 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
|
454 |
+
"author": "@thorstenMueller",
|
455 |
+
"license": "apache 2.0",
|
456 |
+
"commit": "unknown"
|
457 |
+
},
|
458 |
+
"fullband-melgan": {
|
459 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
|
460 |
+
"author": "@thorstenMueller",
|
461 |
+
"license": "apache 2.0",
|
462 |
+
"commit": "unknown"
|
463 |
+
}
|
464 |
+
}
|
465 |
+
},
|
466 |
+
"ja": {
|
467 |
+
"kokoro": {
|
468 |
+
"hifigan_v1": {
|
469 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
|
470 |
+
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
|
471 |
+
"author": "@kaiidams",
|
472 |
+
"license": "apache 2.0",
|
473 |
+
"commit": "3900448"
|
474 |
+
}
|
475 |
+
}
|
476 |
+
},
|
477 |
+
"uk": {
|
478 |
+
"mai": {
|
479 |
+
"multiband-melgan": {
|
480 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
481 |
+
"author":"@robinhad",
|
482 |
+
"commit": "bdab788d",
|
483 |
+
"license": "MIT",
|
484 |
+
"contact": ""
|
485 |
+
}
|
486 |
+
}
|
487 |
+
},
|
488 |
+
"tr":{
|
489 |
+
"common-voice": {
|
490 |
+
"hifigan":{
|
491 |
+
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
492 |
+
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
493 |
+
"author": "Fatih Akademi",
|
494 |
+
"license": "MIT",
|
495 |
+
"commit": null
|
496 |
+
}
|
497 |
+
}
|
498 |
+
}
|
499 |
+
}
|
500 |
+
}
|
TTS/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
|
4 |
+
version = f.read().strip()
|
5 |
+
|
6 |
+
__version__ = version
|
TTS/bin/__init__.py
ADDED
File without changes
|
TTS/bin/collect_env_info.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Get detailed info about the working environment."""
|
2 |
+
import os
|
3 |
+
import platform
|
4 |
+
import sys
|
5 |
+
|
6 |
+
import numpy
|
7 |
+
import torch
|
8 |
+
|
9 |
+
sys.path += [os.path.abspath(".."), os.path.abspath(".")]
|
10 |
+
import json
|
11 |
+
|
12 |
+
import TTS
|
13 |
+
|
14 |
+
|
15 |
+
def system_info():
|
16 |
+
return {
|
17 |
+
"OS": platform.system(),
|
18 |
+
"architecture": platform.architecture(),
|
19 |
+
"version": platform.version(),
|
20 |
+
"processor": platform.processor(),
|
21 |
+
"python": platform.python_version(),
|
22 |
+
}
|
23 |
+
|
24 |
+
|
25 |
+
def cuda_info():
|
26 |
+
return {
|
27 |
+
"GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
|
28 |
+
"available": torch.cuda.is_available(),
|
29 |
+
"version": torch.version.cuda,
|
30 |
+
}
|
31 |
+
|
32 |
+
|
33 |
+
def package_info():
|
34 |
+
return {
|
35 |
+
"numpy": numpy.__version__,
|
36 |
+
"PyTorch_version": torch.__version__,
|
37 |
+
"PyTorch_debug": torch.version.debug,
|
38 |
+
"TTS": TTS.__version__,
|
39 |
+
}
|
40 |
+
|
41 |
+
|
42 |
+
def main():
|
43 |
+
details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
|
44 |
+
print(json.dumps(details, indent=4, sort_keys=True))
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == "__main__":
|
48 |
+
main()
|
TTS/bin/compute_attention_masks.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import importlib
|
3 |
+
import os
|
4 |
+
from argparse import RawTextHelpFormatter
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
from torch.utils.data import DataLoader
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
from TTS.config import load_config
|
12 |
+
from TTS.tts.datasets.TTSDataset import TTSDataset
|
13 |
+
from TTS.tts.models import setup_model
|
14 |
+
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
|
15 |
+
from TTS.utils.audio import AudioProcessor
|
16 |
+
from TTS.utils.io import load_checkpoint
|
17 |
+
|
18 |
+
if __name__ == "__main__":
|
19 |
+
# pylint: disable=bad-option-value
|
20 |
+
parser = argparse.ArgumentParser(
|
21 |
+
description="""Extract attention masks from trained Tacotron/Tacotron2 models.
|
22 |
+
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
|
23 |
+
"""Each attention mask is written to the same path as the input wav file with ".npy" file extension.
|
24 |
+
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
|
25 |
+
"""
|
26 |
+
Example run:
|
27 |
+
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
|
28 |
+
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
|
29 |
+
--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
|
30 |
+
--dataset_metafile metadata.csv
|
31 |
+
--data_path /root/LJSpeech-1.1/
|
32 |
+
--batch_size 32
|
33 |
+
--dataset ljspeech
|
34 |
+
--use_cuda True
|
35 |
+
""",
|
36 |
+
formatter_class=RawTextHelpFormatter,
|
37 |
+
)
|
38 |
+
parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
|
39 |
+
parser.add_argument(
|
40 |
+
"--config_path",
|
41 |
+
type=str,
|
42 |
+
required=True,
|
43 |
+
help="Path to Tacotron/Tacotron2 config file.",
|
44 |
+
)
|
45 |
+
parser.add_argument(
|
46 |
+
"--dataset",
|
47 |
+
type=str,
|
48 |
+
default="",
|
49 |
+
required=True,
|
50 |
+
help="Target dataset processor name from TTS.tts.dataset.preprocess.",
|
51 |
+
)
|
52 |
+
|
53 |
+
parser.add_argument(
|
54 |
+
"--dataset_metafile",
|
55 |
+
type=str,
|
56 |
+
default="",
|
57 |
+
required=True,
|
58 |
+
help="Dataset metafile inclusing file paths with transcripts.",
|
59 |
+
)
|
60 |
+
parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
|
61 |
+
parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
|
62 |
+
|
63 |
+
parser.add_argument(
|
64 |
+
"--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
|
65 |
+
)
|
66 |
+
args = parser.parse_args()
|
67 |
+
|
68 |
+
C = load_config(args.config_path)
|
69 |
+
ap = AudioProcessor(**C.audio)
|
70 |
+
|
71 |
+
# if the vocabulary was passed, replace the default
|
72 |
+
if "characters" in C.keys():
|
73 |
+
symbols, phonemes = make_symbols(**C.characters)
|
74 |
+
|
75 |
+
# load the model
|
76 |
+
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
|
77 |
+
# TODO: handle multi-speaker
|
78 |
+
model = setup_model(C)
|
79 |
+
model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
|
80 |
+
|
81 |
+
# data loader
|
82 |
+
preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
|
83 |
+
preprocessor = getattr(preprocessor, args.dataset)
|
84 |
+
meta_data = preprocessor(args.data_path, args.dataset_metafile)
|
85 |
+
dataset = TTSDataset(
|
86 |
+
model.decoder.r,
|
87 |
+
C.text_cleaner,
|
88 |
+
compute_linear_spec=False,
|
89 |
+
ap=ap,
|
90 |
+
meta_data=meta_data,
|
91 |
+
characters=C.characters if "characters" in C.keys() else None,
|
92 |
+
add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
|
93 |
+
use_phonemes=C.use_phonemes,
|
94 |
+
phoneme_cache_path=C.phoneme_cache_path,
|
95 |
+
phoneme_language=C.phoneme_language,
|
96 |
+
enable_eos_bos=C.enable_eos_bos_chars,
|
97 |
+
)
|
98 |
+
|
99 |
+
dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
|
100 |
+
loader = DataLoader(
|
101 |
+
dataset,
|
102 |
+
batch_size=args.batch_size,
|
103 |
+
num_workers=4,
|
104 |
+
collate_fn=dataset.collate_fn,
|
105 |
+
shuffle=False,
|
106 |
+
drop_last=False,
|
107 |
+
)
|
108 |
+
|
109 |
+
# compute attentions
|
110 |
+
file_paths = []
|
111 |
+
with torch.no_grad():
|
112 |
+
for data in tqdm(loader):
|
113 |
+
# setup input data
|
114 |
+
text_input = data[0]
|
115 |
+
text_lengths = data[1]
|
116 |
+
linear_input = data[3]
|
117 |
+
mel_input = data[4]
|
118 |
+
mel_lengths = data[5]
|
119 |
+
stop_targets = data[6]
|
120 |
+
item_idxs = data[7]
|
121 |
+
|
122 |
+
# dispatch data to GPU
|
123 |
+
if args.use_cuda:
|
124 |
+
text_input = text_input.cuda()
|
125 |
+
text_lengths = text_lengths.cuda()
|
126 |
+
mel_input = mel_input.cuda()
|
127 |
+
mel_lengths = mel_lengths.cuda()
|
128 |
+
|
129 |
+
model_outputs = model.forward(text_input, text_lengths, mel_input)
|
130 |
+
|
131 |
+
alignments = model_outputs["alignments"].detach()
|
132 |
+
for idx, alignment in enumerate(alignments):
|
133 |
+
item_idx = item_idxs[idx]
|
134 |
+
# interpolate if r > 1
|
135 |
+
alignment = (
|
136 |
+
torch.nn.functional.interpolate(
|
137 |
+
alignment.transpose(0, 1).unsqueeze(0),
|
138 |
+
size=None,
|
139 |
+
scale_factor=model.decoder.r,
|
140 |
+
mode="nearest",
|
141 |
+
align_corners=None,
|
142 |
+
recompute_scale_factor=None,
|
143 |
+
)
|
144 |
+
.squeeze(0)
|
145 |
+
.transpose(0, 1)
|
146 |
+
)
|
147 |
+
# remove paddings
|
148 |
+
alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
|
149 |
+
# set file paths
|
150 |
+
wav_file_name = os.path.basename(item_idx)
|
151 |
+
align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
|
152 |
+
file_path = item_idx.replace(wav_file_name, align_file_name)
|
153 |
+
# save output
|
154 |
+
wav_file_abs_path = os.path.abspath(item_idx)
|
155 |
+
file_abs_path = os.path.abspath(file_path)
|
156 |
+
file_paths.append([wav_file_abs_path, file_abs_path])
|
157 |
+
np.save(file_path, alignment)
|
158 |
+
|
159 |
+
# ourput metafile
|
160 |
+
metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
|
161 |
+
|
162 |
+
with open(metafile, "w", encoding="utf-8") as f:
|
163 |
+
for p in file_paths:
|
164 |
+
f.write(f"{p[0]}|{p[1]}\n")
|
165 |
+
print(f" >> Metafile created: {metafile}")
|
TTS/bin/compute_embeddings.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
import os
from argparse import RawTextHelpFormatter

import torch
from tqdm import tqdm

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager


def _str2bool(value):
    """Parse a CLI boolean so that `--flag False` really means False.

    The previous `type=bool` treated any non-empty string (including the
    literal "False") as True.
    """
    if isinstance(value, bool):
        return value
    return value.lower() in ("1", "true", "t", "yes", "y")


parser = argparse.ArgumentParser(
    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
    """
    Example runs:
    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
    """,
    formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument("config_path", type=str, help="Path to model config file.")
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
# nargs="?" + const=True keeps plain `--disable_cuda` working while also
# accepting explicit `--disable_cuda false`.
parser.add_argument(
    "--disable_cuda", type=_str2bool, nargs="?", const=True, help="Flag to disable cuda.", default=False
)
parser.add_argument(
    "--no_eval", type=_str2bool, nargs="?", const=True, help="Do not compute eval?. Default False", default=False
)

args = parser.parse_args()

use_cuda = torch.cuda.is_available() and not args.disable_cuda

# dataset config only describes where the samples live
c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)

if meta_data_eval is None:
    wav_files = meta_data_train
else:
    wav_files = meta_data_train + meta_data_eval

encoder_manager = SpeakerManager(
    encoder_model_path=args.model_path,
    encoder_config_path=args.config_path,
    d_vectors_file_path=args.old_file,
    use_cuda=use_cuda,
)

class_name_key = encoder_manager.encoder_config.class_name_key

# compute speaker embeddings
speaker_mapping = {}
for wav_file in tqdm(wav_files):
    if isinstance(wav_file, dict):
        class_name = wav_file[class_name_key]
        wav_file = wav_file["audio_file"]
    else:
        class_name = None

    wav_file_name = os.path.basename(wav_file)
    if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
        # reuse the embedding from the old file instead of recomputing
        embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
    else:
        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)

    # map clip name -> class name and embedding
    speaker_mapping[wav_file_name] = {"name": class_name, "embedding": embedd}

if speaker_mapping:
    # save speaker_mapping if target dataset is defined
    if os.path.isdir(args.output_path):
        mapping_file_path = os.path.join(args.output_path, "speakers.pth")
    else:
        mapping_file_path = args.output_path

    if os.path.dirname(mapping_file_path) != "":
        os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

    save_file(speaker_mapping, mapping_file_path)
    print("Speaker embeddings saved at:", mapping_file_path)
|
TTS/bin/compute_statistics.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import glob
|
6 |
+
import os
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
# from TTS.utils.io import load_config
|
12 |
+
from TTS.config import load_config
|
13 |
+
from TTS.tts.datasets import load_tts_samples
|
14 |
+
from TTS.utils.audio import AudioProcessor
|
15 |
+
|
16 |
+
|
17 |
+
def main():
    """Compute per-bin mean/std of mel and linear spectrograms over a dataset.

    The stats (plus the audio config used to produce them) are saved as a
    ``.npy`` pickle so they can be referenced via ``audio.stats_path`` for
    mean-variance normalization during training.
    """
    # NOTE(review): "spectrogtram"/"processin" typos below are user-facing help
    # text; left untouched here since this is a documentation-only pass.
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    # unknown CLI args are collected and applied as config overrides below
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        # --data_path overrides the dataset config: just glob every wav
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    # single-pass accumulators for E[x] and E[x^2], per frequency bin
    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0  # total number of spectrogram frames seen
    for item in tqdm(dataset_items):
        # compute features; items are either plain paths or sample dicts
        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    # mean = E[x]; std = sqrt(E[x^2] - E[x]^2)
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values (irrelevant once mean-var scaling is active)
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
|
TTS/bin/eval_encoder.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
from argparse import RawTextHelpFormatter
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
from TTS.config import load_config
|
8 |
+
from TTS.tts.datasets import load_tts_samples
|
9 |
+
from TTS.tts.utils.speakers import SpeakerManager
|
10 |
+
|
11 |
+
|
12 |
+
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Classify every sample with the encoder and print per-class + average accuracy.

    Args:
        dataset_items: Sample dicts carrying "audio_file" and the class-name key
            declared by the encoder config (``class_name_key``).
        encoder_manager: ``SpeakerManager`` wrapping the trained encoder model.

    Raises:
        RuntimeError: When a sample has no class name or no prediction is possible
            (no criterion / no class-id mapping in the config).
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
            embedding = torch.FloatTensor(embedd).unsqueeze(0)
            if encoder_manager.use_cuda:
                embedding = embedding.cuda()

            # classify via the criterion's softmax head, then map id -> name
            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
            predicted_label = map_classid_to_classname[str(class_id)]
        else:
            predicted_label = None

        if class_name is not None and predicted_label is not None:
            is_equal = int(class_name == predicted_label)
            class_acc_dict.setdefault(class_name, []).append(is_equal)
        else:
            raise RuntimeError("Error: class_name or/and predicted_label are None")

    if not class_acc_dict:
        # guard the empty-dataset case instead of dividing by zero below
        print("No samples were evaluated; accuracy is undefined.")
        return

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg / len(class_acc_dict))
|
52 |
+
|
53 |
+
|
54 |
+
if __name__ == "__main__":

    def _str2bool(value):
        """Parse CLI booleans; plain ``type=bool`` treats the string "False" as True."""
        if isinstance(value, bool):
            return value
        return value.lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
    Example runs:
    python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    # With default=True and the old `type=bool`, `--use_cuda False` still meant
    # True; _str2bool makes the explicit value effective, and nargs="?"/const
    # keeps a bare `--use_cuda` meaning True.
    parser.add_argument("--use_cuda", type=_str2bool, nargs="?", const=True, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=_str2bool, nargs="?", const=True, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    items = meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
|
TTS/bin/extract_tts_spectrograms.py
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""Extract Mel spectrograms with teacher forcing."""
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import os
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
from torch.utils.data import DataLoader
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
from TTS.config import load_config
|
13 |
+
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
14 |
+
from TTS.tts.models import setup_model
|
15 |
+
from TTS.tts.utils.speakers import SpeakerManager
|
16 |
+
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
17 |
+
from TTS.utils.audio import AudioProcessor
|
18 |
+
from TTS.utils.generic_utils import count_parameters
|
19 |
+
|
20 |
+
use_cuda = torch.cuda.is_available()
|
21 |
+
|
22 |
+
|
23 |
+
def setup_loader(ap, r, verbose=False):
    """Build a DataLoader over the module-global `meta_data` for teacher-forced extraction.

    Args:
        ap: Initialized ``AudioProcessor`` used for feature computation.
        r: Model's outputs-per-decoder-step (reduction factor).
        verbose (bool): Forwarded to ``TTSDataset`` for progress prints.

    Reads the module globals ``c`` (config), ``meta_data`` and
    ``speaker_manager`` that are set up in ``main``.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)
    dataset = TTSDataset(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
        d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
    )

    if c.use_phonemes and c.compute_input_seq_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    loader = DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,  # keep deterministic order so outputs map back to inputs
        collate_fn=dataset.collate_fn,
        drop_last=False,  # extract every sample, including the last partial batch
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
    return loader
|
60 |
+
|
61 |
+
|
62 |
+
def set_filename(wav_path, out_path):
    """Build the output paths for one utterance and create the output directories.

    Args:
        wav_path: Path of the source wav file; only its basename is used.
        out_path: Root output directory.

    Returns:
        Tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``.
    """
    wav_file = os.path.basename(wav_path)
    # splitext keeps interior dots (e.g. "a.b.wav" -> "a.b"); the previous
    # split(".")[0] collapsed distinct dotted filenames onto the same output.
    file_name = os.path.splitext(wav_file)[0]
    for subdir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, subdir), exist_ok=True)
    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
|
74 |
+
|
75 |
+
|
76 |
+
def format_data(data):
    """Unpack one collated batch and move model inputs to the GPU when `use_cuda` is set.

    Returns a 10-tuple: (text, text_lens, mel, mel_lens, speaker_ids,
    d_vectors, avg_text_len, avg_spec_len, attn_mask, item_idx).
    """
    # pull the fields the extraction loop needs out of the collate dict
    text_input = data["token_id"]
    text_lengths = data["token_id_lengths"]
    mel_input = data["mel"]
    mel_lengths = data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # batch statistics (stay on CPU)
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    if use_cuda:
        # mandatory tensors always move ...
        text_input = text_input.cuda(non_blocking=True)
        text_lengths = text_lengths.cuda(non_blocking=True)
        mel_input = mel_input.cuda(non_blocking=True)
        mel_lengths = mel_lengths.cuda(non_blocking=True)
        # ... optional conditioning tensors may be None in the batch
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
        if d_vectors is not None:
            d_vectors = d_vectors.cuda(non_blocking=True)
        if attn_mask is not None:
            attn_mask = attn_mask.cuda(non_blocking=True)

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
|
113 |
+
|
114 |
+
|
115 |
+
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run one teacher-forced forward pass and return the mel outputs as numpy.

    Args:
        model_name (str): Lower-cased model name ("glow_tts", "tacotron" or "tacotron2").
        model: The restored TTS model.
        ap: ``AudioProcessor``; only used by "tacotron" to map linear specs to mel.
        text_input: Padded token-id batch.
        text_lengths: Per-sample unpadded token lengths.
        mel_input: Ground-truth mel batch used for teacher forcing.
        mel_lengths: Per-sample unpadded mel lengths.
        speaker_ids: Optional speaker-id tensor for multi-speaker models.
        d_vectors: Optional d-vector tensor for multi-speaker models.

    Returns:
        numpy array with the batch of predicted mel spectrograms.

    NOTE(review): any other ``model_name`` leaves ``model_output`` unbound and
    the final ``return`` raises UnboundLocalError — confirm callers only pass
    the three names above.
    """
    if model_name == "glow_tts":
        # glow-tts takes one conditioning input: speaker ids or d-vectors
        speaker_c = None
        if speaker_ids is not None:
            speaker_c = speaker_ids
        elif d_vectors is not None:
            speaker_c = d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        model_output = outputs["model_outputs"]
        model_output = model_output.detach().cpu().numpy()

    elif "tacotron" in model_name:
        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
        postnet_outputs = outputs["model_outputs"]
        # normalize tacotron output
        if model_name == "tacotron":
            # tacotron (v1) predicts linear spectrograms; convert each to mel
            mel_specs = []
            postnet_outputs = postnet_outputs.data.cpu().numpy()
            for b in range(postnet_outputs.shape[0]):
                postnet_output = postnet_outputs[b]
                mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
            model_output = torch.stack(mel_specs).cpu().numpy()

        elif model_name == "tacotron2":
            model_output = postnet_outputs.detach().cpu().numpy()
    return model_output
|
159 |
+
|
160 |
+
|
161 |
+
def extract_spectrograms(
    data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over `data_loader` with teacher forcing and save per-utterance mels.

    Args:
        data_loader: DataLoader yielding collated batches (see ``format_data``).
        model: TTS model, already restored and on the right device.
        ap: ``AudioProcessor`` for wav loading/saving and Griffin-Lim inversion.
        output_path: Root dir; mels go to "mel/", optional audio to "wav/",
            "wav_gl/" and "quant/".
        quantized_wav (bool): Also save quantized wavs via ``ap.quantize``.
        save_audio (bool): Also copy the source audio into "wav/".
        debug (bool): Save a Griffin-Lim re-synthesis of each mel for listening.
        metada_name (str): Name of the "wav_path|mel_path.npy" metadata file
            written into ``output_path`` at the end.

    Reads the module global ``c`` for the model name.
    """
    model.eval()
    export_metadata = []
    for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):

        # format data
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            _,
            _,
            _,
            item_idx,
        ) = format_data(data)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        for idx in range(text_input.shape[0]):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantized_wav:
                wavq = ap.quantize(wav)
                np.save(wavq_path, wavq)

            # save TTS mel
            mel = model_output[idx]
            mel_length = mel_lengths[idx]
            # strip batch padding before saving; stored transposed
            mel = mel[:mel_length, :].T
            np.save(mel_path, mel)

            export_metadata.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                wav = ap.inv_melspectrogram(mel)
                ap.save_wav(wav, wav_gl_path)

    # write "source wav|saved mel" pairs for downstream vocoder training
    with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
        for data in export_metadata:
            f.write(f"{data[0]}|{data[1]+'.npy'}\n")
|
222 |
+
|
223 |
+
|
224 |
+
def main(args):  # pylint: disable=redefined-outer-name
    """Load data and model from the global config ``c``, then extract spectrograms.

    Sets the module globals ``meta_data`` and ``speaker_manager`` consumed by
    ``setup_loader``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)
    # set r
    # glow_tts has no reduction factor; tacotron models expose it on the decoder
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
|
272 |
+
|
273 |
+
|
274 |
+
if __name__ == "__main__":

    def _str2bool(value):
        """Parse CLI booleans; plain ``type=bool`` treats the string "False" as True."""
        if isinstance(value, bool):
            return value
        return value.lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
    # `--eval False` now really disables the eval split; a bare `--eval` still means True
    parser.add_argument("--eval", type=_str2bool, nargs="?", const=True, help="compute eval.", default=True)
    args = parser.parse_args()

    c = load_config(args.config_path)
    c.audio.trim_silence = False  # keep full-length audio so specs align with the source wavs
    main(args)
|
TTS/bin/find_unique_chars.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Find all the unique characters in a dataset"""
|
2 |
+
import argparse
|
3 |
+
from argparse import RawTextHelpFormatter
|
4 |
+
|
5 |
+
from TTS.config import load_config
|
6 |
+
from TTS.tts.datasets import load_tts_samples
|
7 |
+
|
8 |
+
|
9 |
+
def main():
    """Print the set of unique characters found across all configured datasets."""
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    items = train_items + eval_items

    # one pass over every character of every transcript
    chars = {ch for item in items for ch in item["text"]}
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


if __name__ == "__main__":
    main()
|
TTS/bin/find_unique_phonemes.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Find all the unique characters in a dataset"""
|
2 |
+
import argparse
|
3 |
+
import multiprocessing
|
4 |
+
from argparse import RawTextHelpFormatter
|
5 |
+
|
6 |
+
from tqdm.contrib.concurrent import process_map
|
7 |
+
|
8 |
+
from TTS.config import load_config
|
9 |
+
from TTS.tts.datasets import load_tts_samples
|
10 |
+
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
|
11 |
+
|
12 |
+
phonemizer = Gruut(language="en-us")


def compute_phonemes(item):
    """Return the unique phonemes of one sample's text; empty list on failure.

    ``load_tts_samples`` yields dict samples, so the text lives under the
    "text" key — the previous ``item[0]`` raised ``KeyError`` for every dict
    sample and, together with the bare ``except``, silently produced an
    empty result for the whole dataset.
    """
    try:
        text = item["text"]
        ph = phonemizer.phonemize(text).split("|")
    except Exception:  # best-effort: skip samples the phonemizer rejects
        return []
    return list(set(ph))
|
22 |
+
|
23 |
+
|
24 |
+
def main():
    """Phonemize every configured sample in parallel and print the unique phoneme set."""
    # pylint: disable=W0601
    global c
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    # every sample must carry a language tag for the phonemizer
    is_lang_def = all(item["language"] for item in items)

    if not c.phoneme_language or not is_lang_def:
        raise ValueError("Phoneme language must be defined in config.")

    # phonemize in parallel across all CPU cores (compute_phonemes is module-level)
    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = []
    for ph in phonemes:
        phones.extend(ph)
    phones = set(phones)
    lower_phones = filter(lambda c: c.islower(), phones)
    phones_force_lower = [c.lower() for c in phones]
    phones_force_lower = set(phones_force_lower)

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
    main()
|
TTS/bin/remove_silence_using_vad.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
import pathlib
|
5 |
+
|
6 |
+
from tqdm import tqdm
|
7 |
+
|
8 |
+
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
9 |
+
|
10 |
+
|
11 |
+
def adjust_path_and_remove_silence(audio_path):
    """Map `audio_path` from input_dir to output_dir and write a silence-trimmed copy.

    Reads the module globals ``args`` and ``model_and_utils`` set in ``__main__``.

    Returns:
        The output file path; existing files are returned untouched unless
        ``--force`` was given.
    """
    output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
    # ignore if the file exists
    if os.path.exists(output_path) and not args.force:
        return output_path

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )

    return output_path
|
29 |
+
|
30 |
+
|
31 |
+
def preprocess_audios():
    """Trim silence from every file matched by ``args.glob`` under ``args.input_dir``."""
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output directory.")

    if args.trim_just_beginning_and_end:
        print("> Trimming just the beginning and the end with nonspeech parts.")
    else:
        print("> Trimming all nonspeech parts.")

    if not files:
        print("> No files Found !")
        return

    # create threads
    # num_threads = multiprocessing.cpu_count()
    # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
    for audio_file in tqdm(files):
        adjust_path_and_remove_silence(audio_file)
|
50 |
+
|
51 |
+
|
52 |
+
if __name__ == "__main__":

    def _str2bool(value):
        """Parse CLI booleans; plain ``type=bool`` treats the string "False" as True."""
        if isinstance(value, bool):
            return value
        return value.lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
    parser.add_argument(
        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    # _str2bool makes `-t False` effective; with the old `type=bool` and
    # default=True it was impossible to turn this off from the CLI.
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=_str2bool,
        nargs="?",
        const=True,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=_str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If True use cuda",
    )
    args = parser.parse_args()
    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
    preprocess_audios()
|
TTS/bin/resample.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
from argparse import RawTextHelpFormatter
|
5 |
+
from distutils.dir_util import copy_tree
|
6 |
+
from multiprocessing import Pool
|
7 |
+
|
8 |
+
import librosa
|
9 |
+
import soundfile as sf
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
|
13 |
+
def resample_file(func_args):
    """Resample a single audio file in place.

    Args:
        func_args: Tuple of ``(filename, output_sr)`` — packed into one
            argument so the function can be used with multiprocessing map
            helpers.
    """
    path, target_sr = func_args
    # librosa resamples on load when `sr` is given; write the result back
    # over the source file.
    audio, sr = librosa.load(path, sr=target_sr)
    sf.write(path, audio, sr)
|
17 |
+
|
18 |
+
|
19 |
+
if __name__ == "__main__":
    # Local import: only the copy step needs it, and it keeps this fix
    # self-contained in the script entry point.
    import shutil

    parser = argparse.ArgumentParser(
        description="""Resample a folder recursively with librosa
                       Can be used in place or create a copy of the folder as an output.\n\n
                       Example run:
                            python TTS/bin/resample.py
                                --input_dir /root/LJSpeech-1.1/
                                --output_sr 22050
                                --output_dir /root/resampled_LJSpeech-1.1/
                                --file_ext wav
                                --n_jobs 24
                    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        default=None,
        required=True,
        help="Path of the folder containing the audio files to resample",
    )

    parser.add_argument(
        "--output_sr",
        type=int,
        default=22050,
        required=False,
        help="Sample rate to which the audio files should be resampled",
    )

    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        required=False,
        help="Path of the destination folder. If not defined, the operation is done in place",
    )

    parser.add_argument(
        "--file_ext",
        type=str,
        default="wav",
        required=False,
        help="Extension of the audio files to resample",
    )

    parser.add_argument(
        "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
    )

    args = parser.parse_args()

    if args.output_dir:
        print("Recursively copying the input folder...")
        # FIX: `distutils.dir_util.copy_tree` is deprecated and distutils is
        # removed in Python 3.12. `shutil.copytree(dirs_exist_ok=True)`
        # (Python 3.8+) keeps the old "merge into an existing destination"
        # behavior.
        shutil.copytree(args.input_dir, args.output_dir, dirs_exist_ok=True)
        # from here on, resample the copy in place
        args.input_dir = args.output_dir

    print("Resampling the audio files...")
    audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
    print(f"Found {len(audio_files)} files...")
    # pair every file with the target sample rate for `resample_file`
    audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
    with Pool(processes=args.n_jobs) as p:
        with tqdm(total=len(audio_files)) as pbar:
            # imap_unordered: update the progress bar as soon as any worker finishes
            for _ in p.imap_unordered(resample_file, audio_files):
                pbar.update()

    print("Done !")
|
TTS/bin/synthesize.py
ADDED
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import sys
|
6 |
+
import pandas as pd
|
7 |
+
from argparse import RawTextHelpFormatter
|
8 |
+
|
9 |
+
# pylint: disable=redefined-outer-name, unused-argument
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
from TTS.utils.manage import ModelManager
|
13 |
+
from TTS.utils.synthesizer import Synthesizer
|
14 |
+
from tqdm.auto import tqdm
|
15 |
+
|
16 |
+
|
17 |
+
def str2bool(v):
    """Convert a command-line string into a boolean.

    Accepts the usual truthy/falsy spellings ("yes"/"no", "true"/"false",
    "t"/"f", "y"/"n", "1"/"0"), case-insensitively. Booleans pass through
    unchanged; anything else raises ``argparse.ArgumentTypeError``.
    """
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
|
25 |
+
|
26 |
+
|
27 |
+
def main():
    """CLI entry point: parse arguments, resolve model files, synthesize speech.

    Supports listing/querying pre-trained models, single- and multi-speaker
    synthesis, voice conversion from a reference wav, and batch synthesis from
    a '|'-separated csv file passed via ``--text``.
    """
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query info for model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query info for model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:
    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # NOTE: the two "Run your own ..." examples above had --model_path and
    # --config_path swapped (model_path pointed at config.json); fixed here.
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    #parser.add_argument("--text_file_path", type=str, default=None, help="A csv file in LJSpeech format ('|' seperated id, text and speaker) to generate speech.")
    #parser.add_argument("--speaker_name_filter", type=str, default=None, help="Filter texts corresponding to a specific speaker in text_file_path ")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )

    # parser.add_argument(
    #     "--out_folder",
    #     type=str,
    #     default="tts_output",
    #     help="Output wav files folder.",
    # )

    # FIX: was `type=bool`, which parses any non-empty string (including
    # "False") as True; use the str2bool converter like --list_models does.
    parser.add_argument("--use_cuda", type=str2bool, nargs="?", const=True, help="Run model on CUDA.", default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,  # FIX: was `type=bool` (see --use_cuda)
        nargs="?",
        const=True,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    args = parser.parse_args()

    # print the description if either text or list_models is not set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths (explicit paths win over downloaded ones)
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query langauge ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    # `--text` may be a '|'-separated csv file (LJSpeech-like format) for batch
    # synthesis. FIX: guard against `args.text` being None — voice-conversion
    # runs that pass only `--reference_wav` used to crash on `.endswith` here.
    if args.text and args.text.endswith(".csv"):
        # Sniff the column count from the first row only instead of parsing
        # the whole file twice, then re-read with the matching column names.
        num_cols = pd.read_csv(args.text, sep="|", header=None, nrows=1).shape[1]
        columns = ["id", "text", "speaker_name", "gender", "text_len", "audio_len", "speaker_wav"][:num_cols]
        df = pd.read_csv(args.text, sep="|", names=columns)
        # NOTE(review): only the first 10 rows are synthesized — this looks
        # like leftover debugging; confirm intent before removing.
        df = df.head(10)

        # print(f'Number of examples before speaker filter: {len(df)}')
        # if args.speaker_name_filter:
        #     df = df[df['speaker_name']==args.speaker_name_filter]
        # print(f'Number of examples after speaker filter: {len(df)}')

        if len(df) == 0:
            raise ValueError("No records found.")

        if "speaker_wav" in df.columns:
            # per-row conditioning wav: ignore --speaker_idx and use the csv column
            for idx, row in tqdm(df.iterrows(), total=len(df), desc="Synthesizing"):
                wav = synthesizer.tts(
                    text=row["text"],
                    speaker_name=None,
                    language_name=args.language_idx,
                    speaker_wav=row["speaker_wav"],
                    reference_wav=args.reference_wav,
                    style_wav=args.capacitron_style_wav,
                    style_text=args.capacitron_style_text,
                    reference_speaker_name=args.reference_speaker_idx,
                )
                synthesizer.save_wav(wav, f'{args.out_path}/{row["id"]}.wav')
        else:
            # fall back to the csv speaker_name column or the CLI --speaker_idx
            for idx, row in tqdm(df.iterrows(), total=len(df), desc="Synthesizing"):
                wav = synthesizer.tts(
                    row["text"],
                    row["speaker_name"] if "speaker_name" in df.columns else args.speaker_idx,
                    args.language_idx,
                    args.speaker_wav,
                    reference_wav=args.reference_wav,
                    style_wav=args.capacitron_style_wav,
                    style_text=args.capacitron_style_text,
                    reference_speaker_name=args.reference_speaker_idx,
                )
                synthesizer.save_wav(wav, f'{args.out_path}/{row["id"]}.wav')
        print(" > Saved output wav files in {}".format(args.out_path))
        return True

    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
|
422 |
+
|
423 |
+
|
424 |
+
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    main()
|
TTS/bin/train_encoder.py
ADDED
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import time
|
7 |
+
import traceback
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from torch.utils.data import DataLoader
|
11 |
+
from trainer.torch import NoamLR
|
12 |
+
from trainer.trainer_utils import get_optimizer
|
13 |
+
|
14 |
+
from TTS.encoder.dataset import EncoderDataset
|
15 |
+
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
|
16 |
+
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
17 |
+
from TTS.encoder.utils.training import init_training
|
18 |
+
from TTS.encoder.utils.visual import plot_embeddings
|
19 |
+
from TTS.tts.datasets import load_tts_samples
|
20 |
+
from TTS.utils.audio import AudioProcessor
|
21 |
+
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
22 |
+
from TTS.utils.io import copy_model_files
|
23 |
+
from TTS.utils.training import check_update
|
24 |
+
|
25 |
+
# cuDNN autotuner: benchmark mode picks the fastest conv kernels per input
# shape (a win when input sizes are stable across steps).
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
# fixed seed for reproducible training runs
torch.manual_seed(54321)
# module-level flags read by the train/eval functions below
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
|
32 |
+
|
33 |
+
|
34 |
+
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the encoder DataLoader for the train or eval split.

    Reads the global config ``c`` and the globals populated in ``main``
    (``meta_data_train`` / ``meta_data_eval`` / ``train_classes``).

    Returns:
        Tuple of (DataLoader, class list, class_id -> class_name mapping).
    """
    if is_val:
        num_utter_per_class = c.eval_num_utter_per_class
        num_classes_in_batch = c.eval_num_classes_in_batch
    else:
        num_utter_per_class = c.num_utter_per_class
        num_classes_in_batch = c.num_classes_in_batch

    dataset = EncoderDataset(
        c,
        ap,
        meta_data_eval if is_val else meta_data_train,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        # data augmentation is a training-only concern
        augmentation_config=None if is_val else c.audio_augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    classes = dataset.get_class_list()

    sampler = PerfectBatchSampler(
        dataset.items,
        classes,
        batch_size=num_classes_in_batch * num_utter_per_class,  # total batch size
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    # the sampler cannot form a batch with more classes than the split contains
    if len(classes) < num_classes_in_batch:
        if is_val:
            raise RuntimeError(
                f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
            )
        raise RuntimeError(
            f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
        )

    # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
    if is_val:
        dataset.set_classes(train_classes)

    loader = DataLoader(
        dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=sampler,
        collate_fn=dataset.collate_fn,
    )
    return loader, classes, dataset.get_map_classid_to_classname()
|
83 |
+
|
84 |
+
|
85 |
+
def evaluation(model, criterion, data_loader, global_step):
    """Run one full pass over the eval loader and return the average loss.

    Also logs the averaged loss and a UMAP projection of the last batch's
    embeddings through the module-level ``dashboard_logger``.
    """
    total_loss = 0
    for data in data_loader:
        with torch.no_grad():
            inputs, labels = data

            # Regroup the perfect-sampler interleaving [3,2,1,3,2,1] into
            # per-class runs [3,3,2,2,1,1], as the criterion expects.
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # move the batch to GPU when available
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            outputs = model(inputs)

            # criterion consumes embeddings grouped as (classes, utterances, dim)
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1),
                labels,
            )
            total_loss += loss.item()

    eval_avg_loss = total_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
|
124 |
+
|
125 |
+
|
126 |
+
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Main training loop for the speaker/emotion encoder.

    Runs ``c.epochs`` epochs over ``data_loader``, optionally evaluating and
    checkpointing, and returns ``(best_loss, global_step)``. Reads the
    module-level globals ``c``, ``use_cuda``, ``OUT_PATH`` and
    ``dashboard_logger``.
    """
    model.train()
    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0
        for _, data in enumerate(data_loader):
            start_time = time.time()

            # setup input data
            inputs, labels = data
            # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
                labels.shape
            )
            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
                inputs.shape
            )
            # ToDo: move it to a unit test
            # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
            # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
            # idx = 0
            # for j in range(0, c.num_classes_in_batch, 1):
            # for i in range(j, len(labels), c.num_classes_in_batch):
            # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
            # print("Invalid")
            # print(labels)
            # exit()
            # idx += 1
            # labels = labels_converted
            # inputs = inputs_converted

            # time spent waiting on the DataLoader since the last step finished
            loader_time = time.time() - end_time
            global_step += 1

            # setup lr
            # NOTE(review): scheduler.step() is called before optimizer.step();
            # PyTorch recommends the opposite order — confirm this offset is
            # intentional before changing, it shifts the warmup schedule by one step.
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation: embeddings grouped as (classes, utterances, dim)
            loss = criterion(
                outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
            )
            loss.backward()
            # gradient clipping; returns the pre-clip norm for logging
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # acumulate the total epoch loss
            tot_loss += loss.item()

            # Averaged Loader Time (exponential moving average weighted by worker count)
            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
            avg_loader_time = (
                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
                if avg_loader_time != 0
                else loader_time
            )
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # Plot Training Epoch Stats
                train_stats = {
                    "loss": loss.item(),
                    "lr": current_lr,
                    "grad_norm": grad_norm,
                    "step_time": step_time,
                    "avg_loader_time": avg_loader_time,
                }
                dashboard_logger.train_epoch_stats(global_step, train_stats)
                figures = {
                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
                }
                dashboard_logger.train_figures(global_step, figures)

            if global_step % c.print_step == 0:
                print(
                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save model
                save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)

            end_time = time.time()

        print("")
        print(
            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )
        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # save the best checkpoint
            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
            model.train()

    return best_loss, global_step
|
252 |
+
|
253 |
+
|
254 |
+
def main(args):  # pylint: disable=redefined-outer-name
    """Set up the encoder training pipeline and run training.

    Builds the audio processor, model, optimizer, data loaders and criterion
    from the module-level config `c`, optionally restores a checkpoint, then
    hands off to `train()`.

    Args:
        args: Parsed command-line arguments; `args.restore_path` selects an
            optional checkpoint to resume from and `args.restore_step` is
            written back by this function.
    """
    # pylint: disable=global-variable-undefined
    # These are consumed by setup_loader()/train() at module level.
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    # The criterion (e.g. softmax-based losses) needs the class count up front.
    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    # Persist the class-id -> class-name mapping in the saved config so the
    # softmaxproto head can be reconstructed at inference time.
    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if args.restore_path:
        # load_checkpoint also restores the criterion state (class weights).
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        # last_epoch = restore_step - 1 so the schedule resumes where it left off.
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
if __name__ == "__main__":
    # init_training parses CLI args, loads the config and prepares output folders/loggers.
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        # User abort: clean up the (possibly empty) experiment folder before exiting.
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            # os._exit skips atexit handlers that could hang on interrupted workers.
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        # Any other failure: drop the experiment folder, show the traceback, exit non-zero.
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
TTS/bin/train_tts.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
|
4 |
+
from trainer import Trainer, TrainerArgs
|
5 |
+
|
6 |
+
from TTS.config import load_config, register_config
|
7 |
+
from TTS.tts.datasets import load_tts_samples
|
8 |
+
from TTS.tts.models import setup_model
|
9 |
+
|
10 |
+
|
11 |
+
@dataclass
class TrainTTSArgs(TrainerArgs):
    """Trainer CLI arguments extended with a `--config_path` option for TTS training."""

    # Path to a `config.json` that fully defines the training run.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
def main():
    """Run `tts` model training directly by a `config.json` file.

    Resolves the run configuration from (in order of precedence) an explicit
    `--config_path`, a `--continue_path` of a previous experiment, or pure
    command-line overrides, then loads samples, builds the model and starts
    the 🐸 Trainer.
    """
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args: parse a base config first to learn the model
        # name, then instantiate that model's registered config class.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config (sees all samples, e.g. to build speaker/language maps)
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    # NOTE: model.config is passed (not the local `config`) so any adjustments
    # the model made during setup are reflected in the run config.
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()
if __name__ == "__main__":
    # Script entry point: delegate everything to main().
    main()
TTS/bin/train_vocoder.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
|
4 |
+
from trainer import Trainer, TrainerArgs
|
5 |
+
|
6 |
+
from TTS.config import load_config, register_config
|
7 |
+
from TTS.utils.audio import AudioProcessor
|
8 |
+
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
9 |
+
from TTS.vocoder.models import setup_model
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """Trainer CLI arguments extended with a `--config_path` option for vocoder training."""

    # Path to a `config.json` that fully defines the training run.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
def main():
    """Run `vocoder` model training directly by a `config.json` file.

    Resolves the run configuration from (in order of precedence) an explicit
    `--config_path`, a `--continue_path` of a previous experiment, or pure
    command-line overrides, loads wav (or wav + precomputed feature) samples,
    builds the model and starts the 🐸 Trainer.
    """
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args: parse a base config first to learn the model
        # name, then instantiate that model's registered config class.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load data raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    # The AudioProcessor is passed as a training asset so data pipelines and
    # logging callbacks can access it.
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
if __name__ == "__main__":
    # Script entry point: delegate everything to main().
    main()
TTS/bin/tune_wavegrad.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Search a good noise schedule for WaveGrad for a given number of inferece iterations"""
|
2 |
+
import argparse
|
3 |
+
from itertools import product as cartesian_product
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from torch.utils.data import DataLoader
|
8 |
+
from tqdm import tqdm
|
9 |
+
|
10 |
+
from TTS.utils.audio import AudioProcessor
|
11 |
+
from TTS.utils.io import load_config
|
12 |
+
from TTS.vocoder.datasets.preprocess import load_wav_data
|
13 |
+
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
14 |
+
from TTS.vocoder.utils.generic_utils import setup_generator
|
15 |
+
|
16 |
+
parser = argparse.ArgumentParser()
|
17 |
+
parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
|
18 |
+
parser.add_argument("--config_path", type=str, help="Path to model config file.")
|
19 |
+
parser.add_argument("--data_path", type=str, help="Path to data directory.")
|
20 |
+
parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
|
21 |
+
parser.add_argument(
|
22 |
+
"--num_iter", type=int, help="Number of model inference iterations that you like to optimize noise schedule for."
|
23 |
+
)
|
24 |
+
parser.add_argument("--use_cuda", type=bool, help="enable/disable CUDA.")
|
25 |
+
parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
|
26 |
+
parser.add_argument(
|
27 |
+
"--search_depth",
|
28 |
+
type=int,
|
29 |
+
default=3,
|
30 |
+
help="Search granularity. Increasing this increases the run-time exponentially.",
|
31 |
+
)
|
32 |
+
|
33 |
+
# load config
|
34 |
+
args = parser.parse_args()
|
35 |
+
config = load_config(args.config_path)
|
36 |
+
|
37 |
+
# setup audio processor
|
38 |
+
ap = AudioProcessor(**config.audio)
|
39 |
+
|
40 |
+
# load dataset
|
41 |
+
_, train_data = load_wav_data(args.data_path, 0)
|
42 |
+
train_data = train_data[: args.num_samples]
|
43 |
+
dataset = WaveGradDataset(
|
44 |
+
ap=ap,
|
45 |
+
items=train_data,
|
46 |
+
seq_len=-1,
|
47 |
+
hop_len=ap.hop_length,
|
48 |
+
pad_short=config.pad_short,
|
49 |
+
conv_pad=config.conv_pad,
|
50 |
+
is_training=True,
|
51 |
+
return_segments=False,
|
52 |
+
use_noise_augment=False,
|
53 |
+
use_cache=False,
|
54 |
+
verbose=True,
|
55 |
+
)
|
56 |
+
loader = DataLoader(
|
57 |
+
dataset,
|
58 |
+
batch_size=1,
|
59 |
+
shuffle=False,
|
60 |
+
collate_fn=dataset.collate_full_clips,
|
61 |
+
drop_last=False,
|
62 |
+
num_workers=config.num_loader_workers,
|
63 |
+
pin_memory=False,
|
64 |
+
)
|
65 |
+
|
66 |
+
# setup the model
|
67 |
+
model = setup_generator(config)
|
68 |
+
if args.use_cuda:
|
69 |
+
model.cuda()
|
70 |
+
|
71 |
+
# setup optimization parameters
|
72 |
+
base_values = sorted(10 * np.random.uniform(size=args.search_depth))
|
73 |
+
print(base_values)
|
74 |
+
exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
|
75 |
+
best_error = float("inf")
|
76 |
+
best_schedule = None
|
77 |
+
total_search_iter = len(base_values) ** args.num_iter
|
78 |
+
for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
|
79 |
+
beta = exponents * base
|
80 |
+
model.compute_noise_level(beta)
|
81 |
+
for data in loader:
|
82 |
+
mel, audio = data
|
83 |
+
y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
|
84 |
+
|
85 |
+
if args.use_cuda:
|
86 |
+
y_hat = y_hat.cpu()
|
87 |
+
y_hat = y_hat.numpy()
|
88 |
+
|
89 |
+
mel_hat = []
|
90 |
+
for i in range(y_hat.shape[0]):
|
91 |
+
m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
|
92 |
+
mel_hat.append(torch.from_numpy(m))
|
93 |
+
|
94 |
+
mel_hat = torch.stack(mel_hat)
|
95 |
+
mse = torch.sum((mel - mel_hat) ** 2).mean()
|
96 |
+
if mse.item() < best_error:
|
97 |
+
best_error = mse.item()
|
98 |
+
best_schedule = {"beta": beta}
|
99 |
+
print(f" > Found a better schedule. - MSE: {mse.item()}")
|
100 |
+
np.save(args.output_path, best_schedule)
|
TTS/config/__init__.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
from typing import Dict
|
5 |
+
|
6 |
+
import fsspec
|
7 |
+
import yaml
|
8 |
+
from coqpit import Coqpit
|
9 |
+
|
10 |
+
from TTS.config.shared_configs import *
|
11 |
+
from TTS.utils.generic_utils import find_module
|
12 |
+
|
13 |
+
|
14 |
+
def read_json_with_comments(json_path):
    """Load a JSON config that may contain JS-style comments (backward compat).

    Strips ``//`` line comments and ``/* ... */`` block comments before
    parsing, while leaving string literals untouched so values such as
    ``"http://example.com"`` are not mangled.

    Args:
        json_path (str): Path (or fsspec URL) to the JSON file.

    Returns:
        dict: The parsed JSON content.
    """
    # fallback to json
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # drop escaped line continuations
    input_str = re.sub(r"\\\n", "", input_str)
    # Strip comments without touching string literals: group 1 matches a JSON
    # string and is kept verbatim; the comment alternatives are replaced by "".
    # (The previous `//.*` approach also deleted the tail of URLs like "http://...".)
    input_str = re.sub(
        r'("(?:[^"\\]|\\.)*")|(/\*(?:.|[\r\n])*?\*/)|(//.*)',
        lambda m: m.group(1) or "",
        input_str,
    )
    data = json.loads(input_str)
    return data
def register_config(model_name: str) -> Coqpit:
    """Resolve the config class registered for a model name.

    Probes the TTS, vocoder and encoder config packages for a module named
    ``<model_name>_config``; the last package that provides one wins.

    Args:
        model_name (str): Model name.

    Raises:
        ModuleNotFoundError: No matching config for the model name.

    Returns:
        Coqpit: config class.
    """
    target = f"{model_name}_config"
    found = None
    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"):
        try:
            found = find_module(package, target)
        except ModuleNotFoundError:
            continue
    if found is None:
        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
    return found
def _process_model_name(config_dict: Dict) -> str:
|
52 |
+
"""Format the model name as expected. It is a band-aid for the old `vocoder` model names.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
config_dict (Dict): A dictionary including the config fields.
|
56 |
+
|
57 |
+
Returns:
|
58 |
+
str: Formatted modelname.
|
59 |
+
"""
|
60 |
+
model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
|
61 |
+
model_name = model_name.replace("_generator", "").replace("_discriminator", "")
|
62 |
+
return model_name
|
63 |
+
|
64 |
+
|
65 |
+
def load_config(config_path: str) -> Coqpit:
    """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
    to find the corresponding Config class. Then initialize the Config.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    config_dict = {}
    ext = os.path.splitext(config_path)[1]
    if ext in (".yml", ".yaml"):
        with fsspec.open(config_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    elif ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.decoder.JSONDecodeError:
            # backwards compat. — old configs may carry JS-style comments
            data = read_json_with_comments(config_path)
    else:
        raise TypeError(f" [!] Unknown config file type {ext}")
    config_dict.update(data)
    # map the (possibly legacy) model name to its registered config class
    model_name = _process_model_name(config_dict)
    config_class = register_config(model_name.lower())
    config = config_class()
    config.from_dict(config_dict)
    return config
def check_config_and_model_args(config, arg_name, value):
    """Compare `arg_name` against `value`, looking in `config.model_args` first, then `config`.

    Returns False if the argument exists in neither place. This patches up the
    compatibility between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    _missing = object()
    model_args = getattr(config, "model_args", _missing)
    if model_args is not _missing and arg_name in model_args:
        return model_args[arg_name] == value
    if hasattr(config, arg_name):
        return config[arg_name] == value
    return False
def get_from_config_or_model_args(config, arg_name):
    """Get the given argument from `config.model_args` if it exists there, otherwise from `config`."""
    _missing = object()
    model_args = getattr(config, "model_args", _missing)
    if model_args is not _missing and arg_name in model_args:
        return model_args[arg_name]
    return config[arg_name]
def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Get the given argument from `config.model_args` if present, else from `config`, else `def_val`."""
    _missing = object()
    model_args = getattr(config, "model_args", _missing)
    if model_args is not _missing and arg_name in model_args:
        return model_args[arg_name]
    if hasattr(config, arg_name):
        return config[arg_name]
    return def_val
TTS/config/shared_configs.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import asdict, dataclass
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from coqpit import Coqpit, check_argument
|
5 |
+
from trainer import TrainerConfig
|
6 |
+
|
7 |
+
|
8 |
+
@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels aka. size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int): 20
            Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air.
            Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```0```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (int, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # rms volume normalization
    do_rms_norm: bool = False
    db_level: float = None
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
    pitch_fmin: float = 0.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields against their valid ranges; raises via `check_argument` on violation."""
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
@dataclass
class BaseDatasetConfig(Coqpit):
    """Base config for TTS datasets.

    Args:
        name (str):
            Dataset name that defines the preprocessor in use. Defaults to None.

        path (str):
            Root path to the dataset files. Defaults to None.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
            Defaults to None.

        ignored_speakers (List):
            List of speakers IDs that are not used at the training. Default None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor.
    """

    name: str = ""
    path: str = ""
    meta_file_train: str = ""
    # None (not []) so the dataclass default stays immutable.
    ignored_speakers: List[str] = None
    language: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Check that the required dataset fields are set; the meta-file fields for validation/attn-mask are optional."""
        c = asdict(self)
        check_argument("name", c, restricted=True)
        check_argument("path", c, restricted=True)
        check_argument("meta_file_train", c, restricted=True)
        check_argument("meta_file_val", c, restricted=False)
        check_argument("meta_file_attn_mask", c, restricted=False)
@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Base config to define the basic 🐸TTS training parameters that are shared
    among all the models. It is based on ```Trainer.TrainingConfig```.

    Args:
        model (str):
            Name of the model that is used in the training.

        num_loader_workers (int):
            Number of workers for training time dataloader.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader.

        use_noise_augment (bool):
            Enable / Disable adding random noise to the training audio samples. Defaults to False.
    """

    model: str = None
    # dataloading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False
TTS/encoder/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Speaker Encoder
|
2 |
+
|
3 |
+
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
|
4 |
+
|
5 |
+
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
|
6 |
+
|
7 |
+
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
|
8 |
+
|
9 |
+

|
10 |
+
|
11 |
+
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
12 |
+
|
13 |
+
To run the code, you need to follow the same flow as in TTS.
|
14 |
+
|
15 |
+
- Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
|
16 |
+
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
17 |
+
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
|
18 |
+
- Watch training on Tensorboard as in TTS
|
TTS/encoder/__init__.py
ADDED
File without changes
|
TTS/encoder/configs/base_encoder_config.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import asdict, dataclass, field
|
2 |
+
from typing import Dict, List
|
3 |
+
|
4 |
+
from coqpit import MISSING
|
5 |
+
|
6 |
+
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
7 |
+
|
8 |
+
|
9 |
+
@dataclass
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model.

    Shared by the speaker and emotion encoder configs. Fields set to
    ``MISSING`` must be provided by the user config before training.
    """

    model: str = None  # encoder type identifier; set by subclasses
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    audio_augmentation: Dict = field(default_factory=lambda: {})

    # training params
    epochs: int = 10000
    loss: str = "angleproto"  # one of "ge2e", "angleproto", "softmaxproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False

    # data loader
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None

    num_loader_workers: int = MISSING
    voice_len: float = 1.6  # length of each training audio segment in seconds

    def check_values(self):
        """Validate config; the model input dim must equal the mel dimension."""
        super().check_values()
        c = asdict(self)
        # BUGFIX: corrected typo in the assertion message ("dimendion").
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimension must be equal to melspectrogram dimension."
|
TTS/encoder/configs/emotion_encoder_config.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import asdict, dataclass
|
2 |
+
|
3 |
+
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Emotion Encoder model."""

    # model identifier consumed by the model/config factory
    model: str = "emotion_encoder"
    # optional {class_id: class_name} map used at inference time for classification
    map_classid_to_classname: dict = None
    # metadata field that carries the class label for this encoder
    class_name_key: str = "emotion_name"
|
TTS/encoder/configs/speaker_encoder_config.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import asdict, dataclass
|
2 |
+
|
3 |
+
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model."""

    # model identifier consumed by the model/config factory
    model: str = "speaker_encoder"
    # metadata field that carries the class label for this encoder
    class_name_key: str = "speaker_name"
|
TTS/encoder/dataset.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch.utils.data import Dataset
|
5 |
+
|
6 |
+
from TTS.encoder.utils.generic_utils import AugmentWAV
|
7 |
+
|
8 |
+
|
9 |
+
class EncoderDataset(Dataset):
    def __init__(
        self,
        config,
        ap,
        meta_data,
        voice_len=1.6,
        num_classes_in_batch=64,
        num_utter_per_class=10,
        verbose=False,
        augmentation_config=None,
        use_torch_spec=None,
    ):
        """Dataset yielding fixed-length voice segments grouped by class.

        Args:
            config (Coqpit): encoder config; ``config.class_name_key`` selects the
                metadata field used as the class label.
            ap (TTS.utils.audio.AudioProcessor): audio processor object.
            meta_data (list): list of dataset instances (dicts with at least
                ``audio_file`` and the class-name key).
            voice_len (float): voice segment length in seconds.
            num_classes_in_batch (int): classes per batch (used only for logging here).
            num_utter_per_class (int): minimum utterances a class must have to be kept.
            verbose (bool): print diagnostic information.
            augmentation_config (dict): optional augmentation settings with keys
                "p", "additive"/"rir" and/or "gaussian".
            use_torch_spec (bool): if True, ``collate_fn`` returns raw waveforms so
                the model computes spectrograms on the fly.
        """
        super().__init__()
        self.config = config
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_utter_per_class = num_utter_per_class
        self.ap = ap
        self.verbose = verbose
        self.use_torch_spec = use_torch_spec
        self.classes, self.items = self.__parse_items()

        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

        # Data Augmentation
        self.augmentator = None
        self.gaussian_augmentation_config = None
        if augmentation_config:
            self.data_augmentation_p = augmentation_config["p"]
            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                self.augmentator = AugmentWAV(ap, augmentation_config)

            if "gaussian" in augmentation_config.keys():
                self.gaussian_augmentation_config = augmentation_config["gaussian"]

        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Classes per Batch: {num_classes_in_batch}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num Classes: {len(self.classes)}")
            print(f" | > Classes: {self.classes}")

    def load_wav(self, filename):
        """Load a waveform at the audio processor's sample rate."""
        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
        return audio

    def __parse_items(self):
        """Group utterances by class, drop rare classes and too-short audios."""
        class_to_utters = {}
        for item in self.items:
            path_ = item["audio_file"]
            class_name = item[self.config.class_name_key]
            if class_name in class_to_utters.keys():
                class_to_utters[class_name].append(path_)
            else:
                class_to_utters[class_name] = [
                    path_,
                ]

        # keep only classes with at least `num_utter_per_class` samples
        # (the original comment claimed the opposite)
        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}

        classes = list(class_to_utters.keys())
        classes.sort()

        new_items = []
        for item in self.items:
            path_ = item["audio_file"]
            # CONSISTENCY FIX: use the configured `class_name_key` (as in the first
            # loop above) instead of branching on the model name.
            class_name = item[self.config.class_name_key]
            # ignore filtered classes
            if class_name not in classes:
                continue
            # ignore audios shorter than one training segment
            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
                continue

            new_items.append({"wav_file_path": path_, "class_name": class_name})

        return classes, new_items

    def __len__(self):
        return len(self.items)

    def get_num_classes(self):
        return len(self.classes)

    def get_class_list(self):
        return self.classes

    def set_classes(self, classes):
        """Override the class list (e.g. to share ids with an eval split)."""
        self.classes = classes
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

    def get_map_classid_to_classname(self):
        """Return the inverse {class_id: class_name} mapping."""
        return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())

    def __getitem__(self, idx):
        return self.items[idx]

    def collate_fn(self, batch):
        """Cut a random fixed-length segment per item and stack features/labels."""
        # get the batch class_ids
        labels = []
        feats = []
        for item in batch:
            utter_path = item["wav_file_path"]
            class_name = item["class_name"]

            # get classid
            class_id = self.classname_to_classid[class_name]
            # load wav file and take a random segment of `seq_len` samples
            wav = self.load_wav(utter_path)
            offset = random.randint(0, wav.shape[0] - self.seq_len)
            wav = wav[offset : offset + self.seq_len]

            if self.augmentator is not None and self.data_augmentation_p:
                if random.random() < self.data_augmentation_p:
                    wav = self.augmentator.apply_one(wav)

            if not self.use_torch_spec:
                mel = self.ap.melspectrogram(wav)
                feats.append(torch.FloatTensor(mel))
            else:
                feats.append(torch.FloatTensor(wav))

            labels.append(class_id)

        feats = torch.stack(feats)
        labels = torch.LongTensor(labels)

        return feats, labels
|
TTS/encoder/losses.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
from torch import nn
|
4 |
+
|
5 |
+
|
6 |
+
# adapted from https://github.com/cvqluu/GE2E-Loss
|
7 |
+
class GE2ELoss(nn.Module):
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
            - loss_method (str): "softmax" or "contrast", selecting the per-embedding loss variant
        """
        super().__init__()
        # learnable scale/offset applied to the cosine-similarity matrix (Eq. 5 of [1])
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        print(" > Initialized Generalized End-to-End loss")

        assert self.loss_method in ["softmax", "contrast"]

        if self.loss_method == "softmax":
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == "contrast":
            self.embed_loss = self.embed_loss_contrast

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        """
        Calculates the new centroids excluding the reference utterance
        """
        # mean over the speaker's remaining utterances (leave-one-out centroid)
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):
            if i == spkr:
                new_centroids.append(excl)
            else:
                new_centroids.append(centroid)
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        """
        Make the cosine similarity matrix with dims (N,M,N)
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed; similarities are
                # clamped from below at 1e-6 to avoid zero values
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                # negative log-softmax of the true-speaker similarity
                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                # hardest negative: the most similar among all other speakers' centroids
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, x, _label=None):
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        # need at least 2 utterances per speaker for leave-one-out centroids
        assert x.size()[1] >= 2

        centroids = torch.mean(x, 1)
        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
        # NOTE(review): `torch.clamp` is out-of-place and its result is discarded,
        # so this line is a no-op; the intent appears to be keeping `w` positive —
        # confirm before "fixing", as changing it would alter training dynamics.
        torch.clamp(self.w, 1e-6)
        cos_sim_matrix = self.w * cos_sim_matrix + self.b
        L = self.embed_loss(x, cos_sim_matrix)
        return L.mean()
|
119 |
+
|
120 |
+
|
121 |
+
# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
|
122 |
+
class AngleProtoLoss(nn.Module):
    """Angular Prototypical loss (https://arxiv.org/abs/2003.11982).

    Expects an input of size (N, M, D) where N is the number of speakers in
    the batch, M the number of utterances per speaker and D the embedding
    dimensionality.

    Args:
        - init_w (float): defines the initial value of w
        - init_b (float): defines the initial value of b
    """

    def __init__(self, init_w=10.0, init_b=-5.0):
        super().__init__()
        # learnable scale/offset applied to the similarity matrix
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        print(" > Initialized Angular Prototypical loss")

    def forward(self, x, _label=None):
        """
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        assert x.size()[1] >= 2

        # anchors: mean of all utterances but the first; positives: the first one
        anchors = torch.mean(x[:, 1:, :], 1)
        positives = x[:, 0, :]
        n_spk = anchors.size()[0]

        sims = F.cosine_similarity(
            positives.unsqueeze(-1).expand(-1, -1, n_spk),
            anchors.unsqueeze(-1).expand(-1, -1, n_spk).transpose(0, 2),
        )
        # kept for parity with the reference implementation (result unused)
        torch.clamp(self.w, 1e-6)
        sims = sims * self.w + self.b
        targets = torch.arange(n_spk).to(sims.device)
        return self.criterion(sims, targets)
|
164 |
+
|
165 |
+
|
166 |
+
class SoftmaxLoss(nn.Module):
    """Plain softmax classification loss over embeddings.

    As defined in https://arxiv.org/abs/2003.11982.

    Args:
        - embedding_dim (float): speaker embedding dim
        - n_speakers (float): number of speakers
    """

    def __init__(self, embedding_dim, n_speakers):
        super().__init__()

        self.criterion = torch.nn.CrossEntropyLoss()
        self.fc = nn.Linear(embedding_dim, n_speakers)

        print("Initialised Softmax Loss")

    def forward(self, x, label=None):
        # flatten (speakers, utterances, dim) -> (speakers * utterances, dim)
        flat = x.reshape(-1, x.size()[-1])
        targets = label.reshape(-1)

        logits = self.fc(flat)
        return self.criterion(logits, targets)

    def inference(self, embedding):
        """Return the most likely class id for a single embedding."""
        logits = self.fc(embedding)
        probs = torch.nn.functional.softmax(logits, dim=1).squeeze(0)
        return torch.argmax(probs)
|
197 |
+
|
198 |
+
|
199 |
+
class SoftmaxAngleProtoLoss(nn.Module):
    """Combined Softmax + Angular Prototypical loss (https://arxiv.org/abs/2009.14153).

    Args:
        - embedding_dim (float): speaker embedding dim
        - n_speakers (float): number of speakers
        - init_w (float): defines the initial value of w
        - init_b (float): defines the initial value of b
    """

    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
        super().__init__()

        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
        self.angleproto = AngleProtoLoss(init_w, init_b)

        print("Initialised SoftmaxAnglePrototypical Loss")

    def forward(self, x, label=None):
        """
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """
        # angular-prototypical term first, then the softmax term, summed
        proto_term = self.angleproto(x)
        softmax_term = self.softmax(x, label)
        return softmax_term + proto_term
|
TTS/encoder/models/base_encoder.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torchaudio
|
4 |
+
from coqpit import Coqpit
|
5 |
+
from torch import nn
|
6 |
+
|
7 |
+
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
8 |
+
from TTS.utils.generic_utils import set_init_dict
|
9 |
+
from TTS.utils.io import load_fsspec
|
10 |
+
|
11 |
+
|
12 |
+
class PreEmphasis(nn.Module):
    """Pre-emphasis FIR filter: y[t] = x[t] - coefficient * x[t-1]."""

    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        # kernel [-coefficient, 1.0] shaped (out_ch=1, in_ch=1, k=2) for conv1d
        kernel = torch.FloatTensor([-self.coefficient, 1.0]).view(1, 1, 2)
        self.register_buffer("filter", kernel)

    def forward(self, x):
        # expects a batch of raw waveforms: (B, T)
        assert len(x.size()) == 2

        padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        return torch.nn.functional.conv1d(padded, self.filter).squeeze(1)
|
23 |
+
|
24 |
+
|
25 |
+
class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions.
    """

    # pylint: disable=W0102
    def __init__(self):
        super(BaseEncoder, self).__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        """Build the on-the-fly feature frontend: pre-emphasis + mel spectrogram.

        Args:
            audio_config (dict): must provide "preemphasis", "fft_size",
                "win_length", "hop_length", "sample_rate" and "num_mels".
        """
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            # TorchSTFT(
            #     n_fft=audio_config["fft_size"],
            #     hop_length=audio_config["hop_length"],
            #     win_length=audio_config["win_length"],
            #     sample_rate=audio_config["sample_rate"],
            #     window="hamming_window",
            #     mel_fmin=0.0,
            #     mel_fmax=None,
            #     use_htk=True,
            #     do_amp_to_db=False,
            #     n_mels=audio_config["num_mels"],
            #     power=2.0,
            #     use_mel=True,
            #     mel_norm=None,
            # )
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        """Gradient-free forward pass."""
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD

        Slides `num_eval` windows of `num_frames` over the input and, when
        `return_mean` is True, averages the per-window embeddings.
        """
        # map to the waveform size when the model consumes raw audio
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        # clamp the window to the input length for short utterances
        if max_len < num_frames:
            num_frames = max_len

        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def get_criterion(self, c: Coqpit, num_classes=None):
        """Return the training criterion selected by ``c.loss``.

        Raises:
            Exception: if ``c.loss`` is not "ge2e", "angleproto" or "softmaxproto".
        """
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            # BUGFIX: error message grammar was "The %s not is a loss supported"
            raise Exception("The %s is not a supported loss" % c.loss)
        return criterion

    def load_checkpoint(
        self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
    ):
        """Load model (and optionally criterion) weights from a checkpoint.

        Args:
            config (Coqpit): model configuration.
            checkpoint_path (str): path (or fsspec URL) of the checkpoint.
            eval (bool): set eval mode and fail hard on state-dict mismatches.
            use_cuda (bool): move model/criterion to GPU after loading.
            criterion: optional training criterion whose state is also restored.

        Returns:
            ``(criterion, step)`` in training mode, otherwise ``criterion``.
        """
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
        try:
            self.load_state_dict(state["model"])
        except (KeyError, RuntimeError) as error:
            # If eval raise the error
            if eval:
                raise error

            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            # BUGFIX: was `set_init_dict(model_dict, state["model"], c)` where `c`
            # is undefined (NameError); use the `config` argument instead.
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instance and load the criterion for the encoder classifier in inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion
|
TTS/encoder/models/lstm.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
from TTS.encoder.models.base_encoder import BaseEncoder
|
5 |
+
|
6 |
+
|
7 |
+
class LSTMWithProjection(nn.Module):
    """Single LSTM layer followed by a linear projection of every timestep."""

    def __init__(self, input_size, hidden_size, proj_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.proj_size = proj_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)

    def forward(self, x):
        # compact the LSTM weights into contiguous memory (cuDNN requirement)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)
        return self.linear(outputs)
|
20 |
+
|
21 |
+
|
22 |
+
class LSTMWithoutProjection(nn.Module):
    """Multi-layer LSTM that projects only the final hidden state through a ReLU."""

    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        _, (hidden_states, _) = self.lstm(x)
        # use the hidden state of the last LSTM layer only
        projected = self.linear(hidden_states[-1])
        return self.relu(projected)
|
32 |
+
|
33 |
+
|
34 |
+
class LSTMSpeakerEncoder(BaseEncoder):
    """LSTM-based d-vector encoder: stacked LSTM layers with optional projection."""

    def __init__(
        self,
        input_dim,
        proj_dim=256,
        lstm_dim=768,
        num_lstm_layers=3,
        use_lstm_with_projection=True,
        use_torch_spec=False,
        audio_config=None,
    ):
        super().__init__()
        self.use_lstm_with_projection = use_lstm_with_projection
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        layers = []
        # choose the LSTM variant
        if use_lstm_with_projection:
            # first layer consumes the spectrogram, the rest consume projections
            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
            for _ in range(num_lstm_layers - 1):
                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
            self.layers = nn.Sequential(*layers)
        else:
            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        # optional on-the-fly spectrogram extraction from raw waveforms
        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        self._init_layers()

    def _init_layers(self):
        # zero biases, Xavier-normal weights for all LSTM/linear parameters
        for name, param in self.layers.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0.0)
            elif "weight" in name:
                nn.init.xavier_normal_(param)

    def forward(self, x, l2_norm=True):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # feature extraction/normalization runs without gradients and in fp32
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                if self.use_torch_spec:
                    # in-place squeeze of the channel dim before spectrogram extraction
                    x.squeeze_(1)
                    x = self.torch_spec(x)
                x = self.instancenorm(x).transpose(1, 2)
        d = self.layers(x)
        if self.use_lstm_with_projection:
            # keep only the last timestep's projection as the embedding
            d = d[:, -1]
        if l2_norm:
            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d
|
TTS/encoder/models/resnet.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
# from TTS.utils.audio import TorchSTFT
|
5 |
+
from TTS.encoder.models.base_encoder import BaseEncoder
|
6 |
+
|
7 |
+
|
8 |
+
class SELayer(nn.Module):
    """Squeeze-and-Excitation channel attention block."""

    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        batch, channels, _, _ = x.size()
        # squeeze: global average pool -> (B, C)
        squeezed = self.avg_pool(x).view(batch, channels)
        # excite: per-channel gates in (0, 1), broadcast over H and W
        gates = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gates
|
24 |
+
|
25 |
+
|
26 |
+
class SEBasicBlock(nn.Module):
    """ResNet basic block with a Squeeze-and-Excitation layer on the residual branch."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        # optional projection matching the residual's shape when stride/channels change
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        # NOTE(review): activation is applied before batch norm here
        # (conv -> relu -> bn), unlike the conventional conv -> bn -> relu
        # order — confirm this is intentional before changing it.
        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out
|
57 |
+
|
58 |
+
|
59 |
+
class ResNetSpeakerEncoder(BaseEncoder):
    """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer
    """

    # pylint: disable=W0102
    def __init__(
        self,
        input_dim=64,
        proj_dim=512,
        layers=[3, 4, 6, 3],
        num_filters=[32, 64, 128, 256],
        encoder_type="ASP",
        log_input=False,
        use_torch_spec=False,
        audio_config=None,
    ):
        # input_dim: number of spectrogram bins of the input features.
        # proj_dim: dimensionality of the output speaker embedding.
        # encoder_type: temporal pooling, "SAP" (self-attentive pooling) or
        #   "ASP" (attentive statistics pooling: mean + std).
        # use_torch_spec: if True, compute the spectrogram from the raw
        #   waveform inside the model (see `forward`).
        super(ResNetSpeakerEncoder, self).__init__()

        self.encoder_type = encoder_type
        self.input_dim = input_dim
        self.log_input = log_input
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(num_filters[0])

        # four SE-ResNet stages; stages 2-4 downsample both axes by 2
        self.inplanes = num_filters[0]
        self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
        self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        # spectrogram extractor provided by BaseEncoder when enabled
        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        # three stride-2 stages halve the frequency axis three times -> /8
        outmap_size = int(self.input_dim / 8)

        # attention network producing per-frame weights for temporal pooling
        self.attention = nn.Sequential(
            nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type == "SAP":
            out_dim = num_filters[3] * outmap_size
        elif self.encoder_type == "ASP":
            # ASP concatenates weighted mean and std -> double the size
            out_dim = num_filters[3] * outmap_size * 2
        else:
            raise ValueError("Undefined encoder")

        self.fc = nn.Linear(out_dim, proj_dim)

        self._init_layers()

    def _init_layers(self):
        # standard ResNet init: Kaiming for convs, unit scale for batch norms
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def create_layer(self, block, planes, blocks, stride=1):
        # Stack `blocks` SE blocks; the first one may downsample via a
        # 1x1-conv shortcut when stride or channel count changes.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    # pylint: disable=R0201
    def new_parameter(self, *size):
        # Xavier-initialized free parameter (helper, does not use `self`)
        out = nn.Parameter(torch.FloatTensor(*size))
        nn.init.xavier_normal_(out)
        return out

    def forward(self, x, l2_norm=False):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # feature extraction runs without gradients and in full precision
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                x.squeeze_(1)
                # if you torch spec compute it otherwise use the mel spec computed by the AP
                if self.use_torch_spec:
                    x = self.torch_spec(x)

                if self.log_input:
                    x = (x + 1e-6).log()
                x = self.instancenorm(x).unsqueeze(1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # flatten (channels, freq) into one axis, keep time: (N, C*F, T)
        x = x.reshape(x.size()[0], -1, x.size()[-1])

        w = self.attention(x)

        if self.encoder_type == "SAP":
            x = torch.sum(x * w, dim=2)
        elif self.encoder_type == "ASP":
            # attention-weighted mean and (clamped) std over the time axis
            mu = torch.sum(x * w, dim=2)
            sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, sg), 1)

        x = x.view(x.size()[0], -1)
        x = self.fc(x)

        if l2_norm:
            x = torch.nn.functional.normalize(x, p=2, dim=1)
        return x
|
TTS/encoder/requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
umap-learn
|
2 |
+
numpy>=1.17.0
|
TTS/encoder/utils/__init__.py
ADDED
File without changes
|
TTS/encoder/utils/generic_utils.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
import random
|
5 |
+
import re
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
from scipy import signal
|
9 |
+
|
10 |
+
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
|
11 |
+
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
|
12 |
+
from TTS.utils.io import save_fsspec
|
13 |
+
|
14 |
+
|
15 |
+
class AugmentWAV(object):
    """Waveform augmentation with additive noise and/or room impulse responses.

    Args:
        ap: audio processor used to load wav files (needs `load_wav` and
            `sample_rate`).
        augmentation_config (dict): may contain an "additive" section
            (noise directories, SNR and noise-count ranges) and/or a "rir"
            section (`rir_path`, `conv_mode`).
    """

    def __init__(self, ap, augmentation_config):

        self.ap = ap
        self.use_additive_noise = False

        if "additive" in augmentation_config.keys():
            self.additive_noise_config = augmentation_config["additive"]
            additive_path = self.additive_noise_config["sounds_path"]
            if additive_path:
                self.use_additive_noise = True
                # get noise types: every sub-dict in the config names a noise directory
                self.additive_noise_types = []
                for key in self.additive_noise_config.keys():
                    if isinstance(self.additive_noise_config[key], dict):
                        self.additive_noise_types.append(key)

                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

                self.noise_list = {}

                for wav_file in additive_files:
                    noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
                    # ignore not listed directories
                    if noise_dir not in self.additive_noise_types:
                        continue
                    if not noise_dir in self.noise_list:
                        self.noise_list[noise_dir] = []
                    self.noise_list[noise_dir].append(wav_file)

                print(
                    f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
                )

        self.use_rir = False

        if "rir" in augmentation_config.keys():
            self.rir_config = augmentation_config["rir"]
            if self.rir_config["rir_path"]:
                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                self.use_rir = True

                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

        self.create_augmentation_global_list()

    def create_augmentation_global_list(self):
        # build the list of augmentation choices drawn from by `apply_one`
        if self.use_additive_noise:
            self.global_noise_list = self.additive_noise_types
        else:
            self.global_noise_list = []
        if self.use_rir:
            self.global_noise_list.append("RIR_AUG")

    def additive_noise(self, noise_type, audio):
        """Mix `audio` with randomly chosen noise files at a random SNR."""
        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

        noise_list = random.sample(
            self.noise_list[noise_type],
            random.randint(
                self.additive_noise_config[noise_type]["min_num_noises"],
                self.additive_noise_config[noise_type]["max_num_noises"],
            ),
        )

        audio_len = audio.shape[0]
        noises_wav = None
        for noise in noise_list:
            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]

            # skip noise clips shorter than the target audio
            if noiseaudio.shape[0] < audio_len:
                continue

            # BUGFIX: the SNR upper bound was previously read from
            # "max_num_noises" (a noise *count*), silently mixing noise at a
            # wrong level; use the intended "max_snr_in_db" key.
            noise_snr = random.uniform(
                self.additive_noise_config[noise_type]["min_snr_in_db"],
                self.additive_noise_config[noise_type]["max_snr_in_db"],
            )
            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
            # scale the noise so the clean/noise energy ratio matches noise_snr
            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

            if noises_wav is None:
                noises_wav = noise_wav
            else:
                noises_wav += noise_wav

        # if all possible files is less than audio, choose other files
        # NOTE(review): this recurses forever if *no* noise file of this type
        # is long enough — verify data before enabling the augmentation.
        if noises_wav is None:
            return self.additive_noise(noise_type, audio)

        return audio + noises_wav

    def reverberate(self, audio):
        """Convolve `audio` with a random, energy-normalized impulse response."""
        audio_len = audio.shape[0]

        rir_file = random.choice(self.rir_files)
        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
        rir = rir / np.sqrt(np.sum(rir**2))
        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

    def apply_one(self, audio):
        """Apply one randomly chosen augmentation (additive noise or RIR)."""
        noise_type = random.choice(self.global_noise_list)
        if noise_type == "RIR_AUG":
            return self.reverberate(audio)

        return self.additive_noise(noise_type, audio)
|
121 |
+
|
122 |
+
|
123 |
+
def to_camel(text):
    """Convert a snake_case name to CamelCase (e.g. "speaker_encoder" -> "SpeakerEncoder")."""
    capitalized = text.capitalize()
    # uppercase every letter that follows an underscore (and drop the underscore)
    return re.sub(r"(?!^)_([a-zA-Z])", lambda match: match.group(1).upper(), capitalized)
|
126 |
+
|
127 |
+
|
128 |
+
def setup_encoder_model(config: "Coqpit"):
    """Instantiate a speaker/emotion encoder model from a config.

    Args:
        config (Coqpit): configuration whose `model_params` dict holds at
            least `model_name` ("lstm" or "resnet") plus the model kwargs.

    Returns:
        The initialized encoder model.

    Raises:
        ValueError: if `model_params["model_name"]` is not a known model.
    """
    model_name = config.model_params["model_name"].lower()
    if model_name == "lstm":
        model = LSTMSpeakerEncoder(
            config.model_params["input_dim"],
            config.model_params["proj_dim"],
            config.model_params["lstm_dim"],
            config.model_params["num_lstm_layers"],
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    elif model_name == "resnet":
        model = ResNetSpeakerEncoder(
            input_dim=config.model_params["input_dim"],
            proj_dim=config.model_params["proj_dim"],
            log_input=config.model_params.get("log_input", False),
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    else:
        # previously an unknown name fell through and raised a confusing
        # UnboundLocalError on `return model`; fail fast with a clear message
        raise ValueError(f" [!] Unknown encoder model name `{model_name}`. Use `lstm` or `resnet`.")
    return model
|
147 |
+
|
148 |
+
|
149 |
+
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
    """Save a step-numbered training checkpoint under `out_path`.

    Stores the model, optimizer and criterion state dicts together with the
    loss, step, epoch and a human-readable date.
    """
    checkpoint_path = os.path.join(out_path, "checkpoint_{}.pth".format(current_step))
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    state = {
        "model": model.state_dict(),
        # optimizer may be None, e.g. for inference-only snapshots
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    save_fsspec(state, checkpoint_path)
|
165 |
+
|
166 |
+
|
167 |
+
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
    """Write `best_model.pth` when `model_loss` improves on `best_loss`.

    Returns:
        The updated best loss (unchanged when no improvement occurred).
    """
    # guard clause: nothing to do unless the loss improved
    if model_loss >= best_loss:
        return best_loss

    state = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    bestmodel_path = os.path.join(out_path, "best_model.pth")
    print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
    save_fsspec(state, bestmodel_path)
    return model_loss
|
TTS/encoder/utils/io.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import os
|
3 |
+
|
4 |
+
from TTS.utils.io import save_fsspec
|
5 |
+
|
6 |
+
|
7 |
+
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
    """Save a step-numbered checkpoint (model + optimizer state and metadata)."""
    checkpoint_path = os.path.join(out_path, "checkpoint_{}.pth".format(current_step))
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    state = {
        "model": model.state_dict(),
        # optimizer may be None, e.g. for inference-only snapshots
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "step": current_step,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    save_fsspec(state, checkpoint_path)
|
21 |
+
|
22 |
+
|
23 |
+
def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
    """Write `best_model.pth` when `model_loss` improves on `best_loss`.

    Returns:
        The updated best loss (unchanged when no improvement occurred).
    """
    # guard clause: nothing to do unless the loss improved
    if model_loss >= best_loss:
        return best_loss

    state = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": current_step,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    bestmodel_path = os.path.join(out_path, "best_model.pth")
    print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
    save_fsspec(state, bestmodel_path)
    return model_loss
|
TTS/encoder/utils/prepare_voxceleb.py
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# ==============================================================================
|
17 |
+
# Only support eager mode and TF>=2.0.0
|
18 |
+
# pylint: disable=no-member, invalid-name, relative-beyond-top-level
|
19 |
+
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
|
20 |
+
""" voxceleb 1 & 2 """
|
21 |
+
|
22 |
+
import hashlib
|
23 |
+
import os
|
24 |
+
import subprocess
|
25 |
+
import sys
|
26 |
+
import zipfile
|
27 |
+
|
28 |
+
import pandas
|
29 |
+
import soundfile as sf
|
30 |
+
from absl import logging
|
31 |
+
|
32 |
+
SUBSETS = {
|
33 |
+
"vox1_dev_wav": [
|
34 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
|
35 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
|
36 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
|
37 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
|
38 |
+
],
|
39 |
+
"vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
|
40 |
+
"vox2_dev_aac": [
|
41 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
|
42 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
|
43 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
|
44 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
|
45 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
|
46 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
|
47 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
|
48 |
+
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
|
49 |
+
],
|
50 |
+
"vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
|
51 |
+
}
|
52 |
+
|
53 |
+
MD5SUM = {
|
54 |
+
"vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
|
55 |
+
"vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
|
56 |
+
"vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
|
57 |
+
"vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
|
58 |
+
}
|
59 |
+
|
60 |
+
USER = {"user": "", "password": ""}
|
61 |
+
|
62 |
+
speaker_id_dict = {}
|
63 |
+
|
64 |
+
|
65 |
+
def download_and_extract(directory, subset, urls):
    """Download and extract the given split of dataset.

    Args:
        directory: the directory where to put the downloaded data.
        subset: subset name of the corpus.
        urls: the list of urls to download the data file.
    """
    os.makedirs(directory, exist_ok=True)

    try:
        for url in urls:
            zip_filepath = os.path.join(directory, url.split("/")[-1])
            if os.path.exists(zip_filepath):
                continue
            logging.info("Downloading %s to %s" % (url, zip_filepath))
            # NOTE(review): credentials are interpolated into a shell string;
            # acceptable for this trusted CLI tool, but never pass untrusted input.
            subprocess.call(
                "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
                shell=True,
            )

            statinfo = os.stat(zip_filepath)
            logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))

        # concatenate all parts into zip files
        if ".zip" not in zip_filepath:
            zip_filepath = "_".join(zip_filepath.split("_")[:-1])
            subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True)
            zip_filepath += ".zip"
        # BUGFIX: `strip(".zip")` removes any of the characters '.', 'z', 'i', 'p'
        # from *both* ends (e.g. "zip_data.zip" -> "_data"); cut the literal
        # ".zip" suffix instead.
        extract_path = zip_filepath[: -len(".zip")]

        # check zip file md5sum
        with open(zip_filepath, "rb") as f_zip:
            md5 = hashlib.md5(f_zip.read()).hexdigest()
        if md5 != MD5SUM[subset]:
            raise ValueError("md5sum of %s mismatch" % zip_filepath)

        with zipfile.ZipFile(zip_filepath, "r") as zfile:
            zfile.extractall(directory)
            # rename the archive's top-level folder to the expected subset name
            extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
            subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True)
    finally:
        # os.remove(zip_filepath)
        pass
|
109 |
+
|
110 |
+
|
111 |
+
def exec_cmd(cmd):
    """Run a command in a subprocess.
    Args:
        cmd: command line to be executed.
    Return:
        int, the return code (-999 when the command could not be launched).
    """
    try:
        retcode = subprocess.call(cmd, shell=True)
    except OSError as e:
        # e.g. the shell itself could not be spawned
        logging.info(f"Execution failed: {e}")
        return -999
    if retcode < 0:
        # negative return codes mean the child died from a signal
        logging.info(f"Child was terminated by signal {retcode}")
    return retcode
|
126 |
+
|
127 |
+
|
128 |
+
def decode_aac_with_ffmpeg(aac_file, wav_file):
    """Decode a given AAC file into WAV using ffmpeg.
    Args:
        aac_file: file path to input AAC file.
        wav_file: file path to output WAV file.
    Return:
        bool, True if success.
    """
    cmd = f"ffmpeg -i {aac_file} {wav_file}"
    logging.info(f"Decoding aac file using command line: {cmd}")
    ret = exec_cmd(cmd)
    if ret == 0:
        return True
    logging.error(f"Failed to decode aac file with retcode {ret}")
    logging.error("Please check your ffmpeg installation.")
    return False
|
144 |
+
|
145 |
+
|
146 |
+
def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.
    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
    """

    logging.info("Preprocessing audio and label for subset %s" % subset)
    source_dir = os.path.join(input_dir, subset)

    files = []
    # Convert all AAC file into WAV format. At the same time, generate the csv
    for root, _, filenames in os.walk(source_dir):
        for filename in filenames:
            name, ext = os.path.splitext(filename)
            if ext.lower() == ".wav":
                # skip "*.m4a.wav" files produced by a previous conversion run
                # (they still carry an inner extension in `name`)
                _, ext2 = os.path.splitext(name)
                if ext2:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext.lower() == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not os.path.exists(wav_file):
                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
                        raise RuntimeError("Audio decoding failed.")
            else:
                continue
            # VoxCeleb layout is <speaker>/<video-id>/<file>, so the speaker
            # name is two path levels above the audio file
            speaker_name = root.split(os.path.sep)[-2]
            if speaker_name not in speaker_id_dict:
                # assign ids in discovery order via the module-level dict
                num = len(speaker_id_dict)
                speaker_id_dict[speaker_name] = num
            # wav_filesize = os.path.getsize(wav_file)
            # NOTE(review): this is a sample count, not milliseconds, despite
            # the "wav_length_ms" column name — confirm against consumers.
            wav_length = len(sf.read(wav_file)[0])
            files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
|
191 |
+
|
192 |
+
|
193 |
+
def processor(directory, subset, force_process):
    """Download and preprocess one VoxCeleb subset, returning its csv path."""
    urls = SUBSETS
    if subset not in urls:
        raise ValueError(subset, "is not in voxceleb")

    subset_csv = os.path.join(directory, subset + ".csv")
    # reuse an existing csv unless a re-run is forced
    if os.path.exists(subset_csv) and not force_process:
        return subset_csv

    logging.info("Downloading and process the voxceleb in %s", directory)
    logging.info("Preparing subset %s", subset)
    download_and_extract(directory, subset, urls[subset])
    convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
    logging.info("Finished downloading and processing")
    return subset_csv
|
209 |
+
|
210 |
+
|
211 |
+
if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) != 4:
        print("Usage: python prepare_data.py save_directory user password")
        sys.exit()

    # positional CLI args: output directory plus the VoxCeleb download
    # credentials (stored in the module-level USER dict used by wget)
    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
    for SUBSET in SUBSETS:
        processor(DIR, SUBSET, False)
|
TTS/encoder/utils/samplers.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
from torch.utils.data.sampler import Sampler, SubsetRandomSampler
|
4 |
+
|
5 |
+
|
6 |
+
class SubsetSampler(Sampler):
    """Deterministic sampler that yields a fixed list of indices in order.

    Args:
        indices (list): a sequence of indices
    """

    def __init__(self, indices):
        super().__init__(indices)
        self.indices = indices

    def __iter__(self):
        yield from self.indices

    def __len__(self):
        return len(self.indices)
|
23 |
+
|
24 |
+
|
25 |
+
class PerfectBatchSampler(Sampler):
    """
    Samples a mini-batch of indices for a balanced class batching

    Args:
        dataset_items(list): dataset items to sample from.
        classes (list): list of classes of dataset_items to sample from.
        batch_size (int): total number of samples to be sampled in a mini-batch.
        num_gpus (int): number of GPU in the data parallel mode.
        shuffle (bool): if True, samples randomly, otherwise samples sequentially.
        drop_last (bool): if True, drops last incomplete batch.
    """

    def __init__(
        self,
        dataset_items,
        classes,
        batch_size,
        num_classes_in_batch,
        num_gpus=1,
        shuffle=True,
        drop_last=False,
        label_key="class_name",
    ):
        super().__init__(dataset_items)
        assert (
            batch_size % (num_classes_in_batch * num_gpus) == 0
        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."

        # group dataset indices by their class label
        label_indices = {}
        for idx, item in enumerate(dataset_items):
            label = item[label_key]
            if label not in label_indices.keys():
                label_indices[label] = [idx]
            else:
                label_indices[label].append(idx)

        # one sub-sampler per class; random or sequential depending on `shuffle`
        if shuffle:
            self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes]
        else:
            self._samplers = [SubsetSampler(label_indices[key]) for key in classes]

        self._batch_size = batch_size
        self._drop_last = drop_last
        self._dp_devices = num_gpus
        self._num_classes_in_batch = num_classes_in_batch

    def __iter__(self):

        batch = []
        # when a batch holds fewer classes than exist, pick a random subset of
        # class samplers for each batch
        if self._num_classes_in_batch != len(self._samplers):
            valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
        else:
            valid_samplers_idx = None

        iters = [iter(s) for s in self._samplers]
        done = False

        while True:
            b = []
            # draw one index from each active class sampler (round-robin)
            for i, it in enumerate(iters):
                if valid_samplers_idx is not None and i not in valid_samplers_idx:
                    continue
                idx = next(it, None)
                if idx is None:
                    # an active class sampler is exhausted -> stop drawing
                    done = True
                    break
                b.append(idx)
            if done:
                break
            batch += b
            if len(batch) == self._batch_size:
                yield batch
                batch = []
                # re-draw the active class subset for the next batch
                if valid_samplers_idx is not None:
                    valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)

        if not self._drop_last:
            if len(batch) > 0:
                groups = len(batch) // self._num_classes_in_batch
                if groups % self._dp_devices == 0:
                    yield batch
                else:
                    # trim the tail batch so every data-parallel device
                    # receives a whole number of class groups
                    batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
                    if len(batch) > 0:
                        yield batch

    def __len__(self):
        # the number of full batches is bounded by the smallest class sampler
        class_batch_size = self._batch_size // self._num_classes_in_batch
        return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers)
|
TTS/encoder/utils/training.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
|
4 |
+
from coqpit import Coqpit
|
5 |
+
from trainer import TrainerArgs, get_last_checkpoint
|
6 |
+
from trainer.logging import logger_factory
|
7 |
+
from trainer.logging.console_logger import ConsoleLogger
|
8 |
+
|
9 |
+
from TTS.config import load_config, register_config
|
10 |
+
from TTS.tts.utils.text.characters import parse_symbols
|
11 |
+
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
|
12 |
+
from TTS.utils.io import copy_model_files
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass
class TrainArgs(TrainerArgs):
    # Extends the base trainer CLI arguments with the model config file path.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
|
18 |
+
|
19 |
+
|
20 |
+
def getarguments():
    """Build the argparse parser for the encoder-training CLI from `TrainArgs`."""
    return TrainArgs().init_argparse(arg_prefix="")
|
24 |
+
|
25 |
+
|
26 |
+
def process_args(args, config=None):
    """Process parsed command line arguments and initialize the config if not provided.

    Args:
        args (argparse.Namespace or dict like): Parsed input arguments. May also be the
            ``(namespace, unknown_args)`` tuple returned by ``parse_known_args``; the
            unknown args are then treated as coqpit config overrides.
        config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.

    Returns:
        c (TTS.utils.io.AttrDict): Config parameters.
        out_path (str): Path to save models and logging.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
            logging to the console.
        dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging

    TODO:
        - Interactive config definition.
    """
    # ``parse_known_args`` returns (namespace, leftover_args); split them here.
    if isinstance(args, tuple):
        args, coqpit_overrides = args
    if args.continue_path:
        # continue a previous training from its output folder
        experiment_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model
    # init config if not already defined
    if config is None:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            # First parse the base config to learn the model name, then build
            # the model-specific config class for it.
            config_base = BaseTrainingConfig()
            config_base.parse_known_args(coqpit_overrides)
            config = register_config(config_base.model)()
    # override values from command-line args
    # NOTE(review): if `config` was passed in and `args` was not a tuple,
    # `coqpit_overrides` is unbound here — callers appear to always pass the
    # parse_known_args tuple; confirm before relying on the non-tuple path.
    config.parse_known_args(coqpit_overrides, relaxed_parser=True)
    experiment_path = args.continue_path
    if not experiment_path:
        experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
    audio_path = os.path.join(experiment_path, "test_audios")
    config.output_log_path = experiment_path
    # setup rank 0 process in distributed training
    dashboard_logger = None
    if args.rank == 0:
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file
        # save the default set to the config file for future
        # compatibility.
        if config.has("characters") and config.characters is None:
            used_characters = parse_symbols()
            new_fields["characters"] = used_characters
        copy_model_files(config, experiment_path, new_fields)
        dashboard_logger = logger_factory(config, experiment_path)
    c_logger = ConsoleLogger()
    return config, experiment_path, audio_path, c_logger, dashboard_logger
|
86 |
+
|
87 |
+
|
88 |
+
def init_arguments():
    """Create an argument parser pre-populated with the :class:`TrainArgs` fields."""
    return TrainArgs().init_argparse(arg_prefix="")
|
92 |
+
|
93 |
+
|
94 |
+
def init_training(config: Coqpit = None):
    """Initialization of a training run."""
    parser = init_arguments()
    # parse_known_args() returns a (namespace, unknown_args) tuple; process_args
    # unpacks it and treats the unknown args as coqpit config overrides.
    args = parser.parse_known_args()
    config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
    # args[0] is the parsed argparse.Namespace.
    return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
|
TTS/encoder/utils/visual.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import matplotlib
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import numpy as np
|
4 |
+
import umap
|
5 |
+
|
6 |
+
matplotlib.use("Agg")
|
7 |
+
|
8 |
+
|
9 |
+
# Fixed palette of 13 RGB colors (normalized to [0, 1]) used to color the
# UMAP scatter plot, one color per class.
# FIX: ``dtype=np.float`` was a deprecated alias of the builtin ``float`` and
# was removed in NumPy 1.24, making this module fail to import. Use the
# explicit ``np.float64`` instead (same dtype, forward compatible).
colormap = (
    np.array(
        [
            [76, 255, 0],
            [0, 127, 70],
            [255, 0, 0],
            [255, 217, 38],
            [0, 135, 255],
            [165, 0, 165],
            [255, 167, 255],
            [0, 255, 255],
            [255, 96, 38],
            [142, 76, 0],
            [33, 0, 127],
            [0, 0, 0],
            [183, 183, 183],
        ],
        dtype=np.float64,
    )
    / 255
)
|
30 |
+
|
31 |
+
|
32 |
+
def plot_embeddings(embeddings, num_classes_in_batch):
    """Project a batch of embeddings to 2D with UMAP and return the scatter figure.

    Assumes ``embeddings`` is ordered class-by-class with a constant number of
    consecutive utterances per class. At most the first 10 classes are plotted.
    The figure is also written to disk as "umap".
    """
    utterances_per_class = embeddings.shape[0] // num_classes_in_batch

    # Cap at 10 classes so the fixed colormap has an entry per class.
    if num_classes_in_batch > 10:
        num_classes_in_batch = 10
        embeddings = embeddings[: num_classes_in_batch * utterances_per_class]

    reducer = umap.UMAP()
    points_2d = reducer.fit_transform(embeddings)
    labels = np.repeat(np.arange(num_classes_in_batch), utterances_per_class)
    point_colors = [colormap[label] for label in labels]
    fig, axis = plt.subplots(figsize=(16, 10))
    _ = axis.scatter(points_2d[:, 0], points_2d[:, 1], c=point_colors)
    plt.gca().set_aspect("equal", "datalim")
    plt.title("UMAP projection")
    plt.tight_layout()
    plt.savefig("umap")
    return fig
|
TTS/model.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import abstractmethod
|
2 |
+
from typing import Dict
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from coqpit import Coqpit
|
6 |
+
from trainer import TrainerModel
|
7 |
+
|
8 |
+
# pylint: skip-file
|
9 |
+
|
10 |
+
|
11 |
+
class BaseTrainerModel(TrainerModel):
    """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.

    Every new 🐸TTS model must inherit it.
    """

    @staticmethod
    @abstractmethod
    def init_from_config(config: Coqpit):
        """Init the model and all its attributes from the given config.

        Override this depending on your model.
        """
        ...

    @abstractmethod
    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        """Forward pass for inference.

        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
        is considered to be the main output and you can add any other auxiliary outputs as you want.

        We don't use `*kwargs` since it is problematic with the TorchScript API.

        NOTE(review): `aux_input={}` is a mutable default argument — safe only
        as long as no implementation mutates it; confirm before relying on it.

        Args:
            input (torch.Tensor): Model input tensor.
            aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.

        Returns:
            Dict: Model outputs with at least the key ```model_outputs```.
        """
        outputs_dict = {"model_outputs": None}
        ...
        return outputs_dict

    @abstractmethod
    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
        """Load a model checkpoint file and get ready for training or inference.

        Args:
            config (Coqpit): Model configuration.
            checkpoint_path (str): Path to the model checkpoint file.
            eval (bool, optional): If true, init model for inference else for training. Defaults to False.
            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
        """
        ...
|
TTS/server/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# :frog: TTS demo server
|
2 |
+
Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
|
3 |
+
|
4 |
+
**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
|
5 |
+
|
6 |
+
Example runs:
|
7 |
+
|
8 |
+
List officially released models.
|
9 |
+
```python TTS/server/server.py --list_models ```
|
10 |
+
|
11 |
+
Run the server with the official models.
|
12 |
+
```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
|
13 |
+
|
14 |
+
Run the server with the official models on a GPU.
|
15 |
+
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
|
16 |
+
|
17 |
+
Run the server with custom models.
|
18 |
+
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
|
TTS/server/__init__.py
ADDED
File without changes
|
TTS/server/conf.json
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
|
3 |
+
"tts_file":"best_model.pth", // tts checkpoint file
|
4 |
+
"tts_config":"config.json", // tts config.json file
|
5 |
+
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
6 |
+
"vocoder_config":null,
|
7 |
+
"vocoder_file": null,
|
8 |
+
"is_wavernn_batched":true,
|
9 |
+
"port": 5002,
|
10 |
+
"use_cuda": true,
|
11 |
+
"debug": true
|
12 |
+
}
|
TTS/server/server.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!flask/bin/python
|
2 |
+
import argparse
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import Union
|
9 |
+
|
10 |
+
from flask import Flask, render_template, request, send_file
|
11 |
+
|
12 |
+
from TTS.config import load_config
|
13 |
+
from TTS.utils.manage import ModelManager
|
14 |
+
from TTS.utils.synthesizer import Synthesizer
|
15 |
+
|
16 |
+
|
17 |
+
def create_argparser():
    """Build the command-line argument parser for the demo server."""

    def str2bool(value):
        # Accept a few common spellings of "true"; everything else is False.
        return value.lower() in ["true", "1", "yes"]

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--vocoder_path",
        type=str,
        default=None,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
    parser.add_argument("--use_cuda", type=str2bool, default=False, help="true to use CUDA.")
    parser.add_argument("--debug", type=str2bool, default=False, help="true to enable Flask debug mode.")
    parser.add_argument("--show_details", type=str2bool, default=False, help="Generate model detail page.")
    return parser
|
59 |
+
|
60 |
+
|
61 |
+
# parse the args
|
62 |
+
# parse the args
args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

# CASE1: list pre-trained TTS models and exit.
# FIX: the original repeated this identical check twice; sys.exit() makes a
# second copy unreachable, so a single check is kept.
if args.list_models:
    manager.list_models()
    sys.exit()

# update in-use models to the specified released models.
model_path = None
config_path = None
speakers_file_path = None
vocoder_path = None
vocoder_config_path = None

# CASE2: load pre-trained model paths (downloads the model if needed).
if args.model_name is not None and not args.model_path:
    model_path, config_path, model_item = manager.download_model(args.model_name)
    # fall back to the model's default vocoder unless one was given
    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

if args.vocoder_name is not None and not args.vocoder_path:
    vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE3: set custom model paths (explicit --model_path overrides CASE2).
if args.model_path is not None:
    model_path = args.model_path
    config_path = args.config_path
    speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
    vocoder_path = args.vocoder_path
    vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    tts_speakers_file=speakers_file_path,
    tts_languages_file=None,
    vocoder_checkpoint=vocoder_path,
    vocoder_config=vocoder_config_path,
    encoder_checkpoint="",
    encoder_config="",
    use_cuda=args.use_cuda,
)

# The demo page shows a speaker selector only for multi-speaker models.
use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
)

speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
|
122 |
+
|
123 |
+
|
124 |
+
def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
    """Interpret the ``style_wav`` request parameter.

    Returns the string unchanged when it points to an existing ``.wav`` file on
    the server (used for style transfer), a dict of GST token weights when it
    is a JSON object (e.g. ``'{"0": 0.1}'``), or ``None`` when it is empty.

    Args:
        style_wav (str): uri

    Returns:
        Union[str, dict]: path to file (str) or gst style (dict)
    """
    if not style_wav:
        return None
    is_server_wav = os.path.isfile(style_wav) and style_wav.endswith(".wav")
    if is_server_wav:
        # style_wav is a .wav file located on the server
        return style_wav
    # otherwise expect a gst dictionary: {token1_id: token1_weight, ...}
    return json.loads(style_wav)
|
141 |
+
|
142 |
+
|
143 |
+
@app.route("/")
|
144 |
+
def index():
|
145 |
+
return render_template(
|
146 |
+
"index.html",
|
147 |
+
show_details=args.show_details,
|
148 |
+
use_multi_speaker=use_multi_speaker,
|
149 |
+
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
|
150 |
+
use_gst=use_gst,
|
151 |
+
)
|
152 |
+
|
153 |
+
|
154 |
+
@app.route("/details")
|
155 |
+
def details():
|
156 |
+
model_config = load_config(args.tts_config)
|
157 |
+
if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
|
158 |
+
vocoder_config = load_config(args.vocoder_config)
|
159 |
+
else:
|
160 |
+
vocoder_config = None
|
161 |
+
|
162 |
+
return render_template(
|
163 |
+
"details.html",
|
164 |
+
show_details=args.show_details,
|
165 |
+
model_config=model_config,
|
166 |
+
vocoder_config=vocoder_config,
|
167 |
+
args=args.__dict__,
|
168 |
+
)
|
169 |
+
|
170 |
+
|
171 |
+
@app.route("/api/tts", methods=["GET"])
|
172 |
+
def tts():
|
173 |
+
text = request.args.get("text")
|
174 |
+
speaker_idx = request.args.get("speaker_id", "")
|
175 |
+
style_wav = request.args.get("style_wav", "")
|
176 |
+
style_wav = style_wav_uri_to_dict(style_wav)
|
177 |
+
print(" > Model input: {}".format(text))
|
178 |
+
print(" > Speaker Idx: {}".format(speaker_idx))
|
179 |
+
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
|
180 |
+
out = io.BytesIO()
|
181 |
+
synthesizer.save_wav(wavs, out)
|
182 |
+
return send_file(out, mimetype="audio/wav")
|
183 |
+
|
184 |
+
|
185 |
+
def main():
    """Run the Flask development server."""
    # "::" binds all IPv6 interfaces (and IPv4 too on dual-stack hosts).
    app.run(debug=args.debug, host="::", port=args.port)


if __name__ == "__main__":
    main()
|
TTS/server/static/coqui-log-green-TTS.png
ADDED
![]() |
TTS/server/templates/details.html
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
|
4 |
+
<head>
|
5 |
+
|
6 |
+
<meta charset="utf-8">
|
7 |
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
8 |
+
<meta name="description" content="">
|
9 |
+
<meta name="author" content="">
|
10 |
+
|
11 |
+
<title>TTS engine</title>
|
12 |
+
|
13 |
+
<!-- Bootstrap core CSS -->
|
14 |
+
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
|
15 |
+
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
|
16 |
+
rel="stylesheet">
|
17 |
+
|
18 |
+
<!-- Custom styles for this template -->
|
19 |
+
<style>
|
20 |
+
body {
|
21 |
+
padding-top: 54px;
|
22 |
+
}
|
23 |
+
|
24 |
+
@media (min-width: 992px) {
|
25 |
+
body {
|
26 |
+
padding-top: 56px;
|
27 |
+
}
|
28 |
+
}
|
29 |
+
</style>
|
30 |
+
</head>
|
31 |
+
|
32 |
+
<body>
|
33 |
+
<a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
|
34 |
+
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
|
35 |
+
|
36 |
+
{% if show_details == true %}
|
37 |
+
|
38 |
+
<div class="container">
|
39 |
+
<b>Model details</b>
|
40 |
+
</div>
|
41 |
+
|
42 |
+
<div class="container">
|
43 |
+
<details>
|
44 |
+
<summary>CLI arguments:</summary>
|
45 |
+
<table border="1" align="center" width="75%">
|
46 |
+
<tr>
|
47 |
+
<td> CLI key </td>
|
48 |
+
<td> Value </td>
|
49 |
+
</tr>
|
50 |
+
|
51 |
+
{% for key, value in args.items() %}
|
52 |
+
|
53 |
+
<tr>
|
54 |
+
<td>{{ key }}</td>
|
55 |
+
<td>{{ value }}</td>
|
56 |
+
</tr>
|
57 |
+
|
58 |
+
{% endfor %}
|
59 |
+
</table>
|
60 |
+
</details>
|
61 |
+
</div></br>
|
62 |
+
|
63 |
+
<div class="container">
|
64 |
+
|
65 |
+
{% if model_config != None %}
|
66 |
+
|
67 |
+
<details>
|
68 |
+
<summary>Model config:</summary>
|
69 |
+
|
70 |
+
<table border="1" align="center" width="75%">
|
71 |
+
<tr>
|
72 |
+
<td> Key </td>
|
73 |
+
<td> Value </td>
|
74 |
+
</tr>
|
75 |
+
|
76 |
+
|
77 |
+
{% for key, value in model_config.items() %}
|
78 |
+
|
79 |
+
<tr>
|
80 |
+
<td>{{ key }}</td>
|
81 |
+
<td>{{ value }}</td>
|
82 |
+
</tr>
|
83 |
+
|
84 |
+
{% endfor %}
|
85 |
+
|
86 |
+
</table>
|
87 |
+
</details>
|
88 |
+
|
89 |
+
{% endif %}
|
90 |
+
|
91 |
+
</div></br>
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
<div class="container">
|
96 |
+
{% if vocoder_config != None %}
|
97 |
+
<details>
|
98 |
+
<summary>Vocoder model config:</summary>
|
99 |
+
|
100 |
+
<table border="1" align="center" width="75%">
|
101 |
+
<tr>
|
102 |
+
<td> Key </td>
|
103 |
+
<td> Value </td>
|
104 |
+
</tr>
|
105 |
+
|
106 |
+
|
107 |
+
{% for key, value in vocoder_config.items() %}
|
108 |
+
|
109 |
+
<tr>
|
110 |
+
<td>{{ key }}</td>
|
111 |
+
<td>{{ value }}</td>
|
112 |
+
</tr>
|
113 |
+
|
114 |
+
{% endfor %}
|
115 |
+
|
116 |
+
|
117 |
+
</table>
|
118 |
+
</details>
|
119 |
+
{% endif %}
|
120 |
+
</div></br>
|
121 |
+
|
122 |
+
{% else %}
|
123 |
+
<div class="container">
|
124 |
+
<b>Please start server with --show_details=true to see details.</b>
|
125 |
+
</div>
|
126 |
+
|
127 |
+
{% endif %}
|
128 |
+
|
129 |
+
</body>
|
130 |
+
|
131 |
+
</html>
|
TTS/server/templates/index.html
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
|
4 |
+
<head>
|
5 |
+
|
6 |
+
<meta charset="utf-8">
|
7 |
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
8 |
+
<meta name="description" content="🐸Coqui AI TTS demo server.">
|
9 |
+
<meta name="author" content="🐸Coqui AI TTS">
|
10 |
+
|
11 |
+
<title>TTS engine</title>
|
12 |
+
|
13 |
+
<!-- Bootstrap core CSS -->
|
14 |
+
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
|
15 |
+
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
|
16 |
+
rel="stylesheet">
|
17 |
+
|
18 |
+
<!-- Custom styles for this template -->
|
19 |
+
<style>
|
20 |
+
body {
|
21 |
+
padding-top: 54px;
|
22 |
+
}
|
23 |
+
|
24 |
+
@media (min-width: 992px) {
|
25 |
+
body {
|
26 |
+
padding-top: 56px;
|
27 |
+
}
|
28 |
+
}
|
29 |
+
</style>
|
30 |
+
</head>
|
31 |
+
|
32 |
+
<body>
|
33 |
+
<a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
|
34 |
+
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
|
35 |
+
|
36 |
+
<!-- Navigation -->
|
37 |
+
<!--
|
38 |
+
<nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
|
39 |
+
<div class="container">
|
40 |
+
<a class="navbar-brand" href="#">Coqui TTS</a>
|
41 |
+
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
|
42 |
+
<span class="navbar-toggler-icon"></span>
|
43 |
+
</button>
|
44 |
+
<div class="collapse navbar-collapse" id="navbarResponsive">
|
45 |
+
<ul class="navbar-nav ml-auto">
|
46 |
+
<li class="nav-item active">
|
47 |
+
<a class="nav-link" href="#">Home
|
48 |
+
<span class="sr-only">(current)</span>
|
49 |
+
</a>
|
50 |
+
</li>
|
51 |
+
</ul>
|
52 |
+
</div>
|
53 |
+
</div>
|
54 |
+
</nav>
|
55 |
+
-->
|
56 |
+
|
57 |
+
<!-- Page Content -->
|
58 |
+
<div class="container">
|
59 |
+
<div class="row">
|
60 |
+
<div class="col-lg-12 text-center">
|
61 |
+
<img class="mt-5" src="{{url_for('static', filename='coqui-log-green-TTS.png')}}" align="middle"
|
62 |
+
width="512" />
|
63 |
+
|
64 |
+
<ul class="list-unstyled">
|
65 |
+
</ul>
|
66 |
+
|
67 |
+
{%if use_gst%}
|
68 |
+
<input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path to wav).." size=45
|
69 |
+
type="text" name="style_wav">
|
70 |
+
{%endif%}
|
71 |
+
|
72 |
+
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
|
73 |
+
<button id="speak-button" name="speak">Speak</button><br /><br />
|
74 |
+
|
75 |
+
{%if use_multi_speaker%}
|
76 |
+
Choose a speaker:
|
77 |
+
<select id="speaker_id" name=speaker_id method="GET" action="/">
|
78 |
+
{% for speaker_id in speaker_ids %}
|
79 |
+
<option value="{{speaker_id}}" SELECTED>{{speaker_id}}</option>"
|
80 |
+
{% endfor %}
|
81 |
+
</select><br /><br />
|
82 |
+
{%endif%}
|
83 |
+
|
84 |
+
{%if show_details%}
|
85 |
+
<button id="details-button" onclick="location.href = 'details'" name="model-details">Model
|
86 |
+
Details</button><br /><br />
|
87 |
+
{%endif%}
|
88 |
+
<audio id="audio" controls autoplay hidden></audio>
|
89 |
+
<p id="message"></p>
|
90 |
+
</div>
|
91 |
+
</div>
|
92 |
+
</div>
|
93 |
+
|
94 |
+
<!-- Bootstrap core JavaScript -->
|
95 |
+
<script>
|
96 |
+
function getTextValue(textId) {
|
97 |
+
const container = q(textId)
|
98 |
+
if (container) {
|
99 |
+
return container.value
|
100 |
+
}
|
101 |
+
return ""
|
102 |
+
}
|
103 |
+
function q(selector) { return document.querySelector(selector) }
|
104 |
+
q('#text').focus()
|
105 |
+
function do_tts(e) {
|
106 |
+
const text = q('#text').value
|
107 |
+
const speaker_id = getTextValue('#speaker_id')
|
108 |
+
const style_wav = getTextValue('#style_wav')
|
109 |
+
if (text) {
|
110 |
+
q('#message').textContent = 'Synthesizing...'
|
111 |
+
q('#speak-button').disabled = true
|
112 |
+
q('#audio').hidden = true
|
113 |
+
synthesize(text, speaker_id, style_wav)
|
114 |
+
}
|
115 |
+
e.preventDefault()
|
116 |
+
return false
|
117 |
+
}
|
118 |
+
q('#speak-button').addEventListener('click', do_tts)
|
119 |
+
q('#text').addEventListener('keyup', function (e) {
|
120 |
+
if (e.keyCode == 13) { // enter
|
121 |
+
do_tts(e)
|
122 |
+
}
|
123 |
+
})
|
124 |
+
function synthesize(text, speaker_id = "", style_wav = "") {
|
125 |
+
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' })
|
126 |
+
.then(function (res) {
|
127 |
+
if (!res.ok) throw Error(res.statusText)
|
128 |
+
return res.blob()
|
129 |
+
}).then(function (blob) {
|
130 |
+
q('#message').textContent = ''
|
131 |
+
q('#speak-button').disabled = false
|
132 |
+
q('#audio').src = URL.createObjectURL(blob)
|
133 |
+
q('#audio').hidden = false
|
134 |
+
}).catch(function (err) {
|
135 |
+
q('#message').textContent = 'Error: ' + err.message
|
136 |
+
q('#speak-button').disabled = false
|
137 |
+
})
|
138 |
+
}
|
139 |
+
</script>
|
140 |
+
|
141 |
+
</body>
|
142 |
+
|
143 |
+
</html>
|
TTS/tts/__init__.py
ADDED
File without changes
|
TTS/tts/configs/__init__.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib
|
2 |
+
import os
|
3 |
+
from inspect import isclass
|
4 |
+
|
5 |
+
# import all files under configs/
|
6 |
+
# configs_dir = os.path.dirname(__file__)
|
7 |
+
# for file in os.listdir(configs_dir):
|
8 |
+
# path = os.path.join(configs_dir, file)
|
9 |
+
# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
|
10 |
+
# config_name = file[: file.find(".py")] if file.endswith(".py") else file
|
11 |
+
# module = importlib.import_module("TTS.tts.configs." + config_name)
|
12 |
+
# for attribute_name in dir(module):
|
13 |
+
# attribute = getattr(module, attribute_name)
|
14 |
+
|
15 |
+
# if isclass(attribute):
|
16 |
+
# # Add the class to this package's variables
|
17 |
+
# globals()[attribute_name] = attribute
|
TTS/tts/configs/align_tts_config.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
5 |
+
from TTS.tts.models.align_tts import AlignTTSArgs
|
6 |
+
|
7 |
+
|
8 |
+
@dataclass
class AlignTTSConfig(BaseTTSConfig):
    """Defines parameters for the AlignTTS model.

    Example:

        >>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
        >>> config = AlignTTSConfig()

    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `align_tts`.
        model_args (Coqpit):
            Model class arguments. Check `AlignTTSArgs` for more details. Defaults to `AlignTTSArgs()`.
        phase_start_steps (List[int]):
            A list of number of steps required to start the next training phase. AlignTTS has 4 different training
            phases. Thus you need to define 4 different values to enable phase based training. If None, it
            trains the whole model together. Defaults to None.
        ssim_alpha (float):
            Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
        spec_loss_alpha (float):
            Weight for the MSE spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
        dur_loss_alpha (float):
            Weight for the duration predictor's loss. Defaults to 1.0.
        mdn_alpha (float):
            Weight for the MDN loss. Defaults to 1.0.
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
        use_d_vector_file (bool):
            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        optimizer (str):
            Name of the model optimizer. Defaults to `Adam`.
        optimizer_params (dict):
            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
        lr_scheduler (str):
            Name of the learning rate scheduler. Defaults to None.
        lr_scheduler_params (dict):
            Arguments of the learning rate scheduler. Defaults to None.
        lr (float):
            Initial learning rate. Defaults to `1e-4`.
        grad_clip (float):
            Gradient norm clipping value. Defaults to `5.0`.
        min_seq_len (int):
            Minimum input sequence length to be used at training.
        max_seq_len (int):
            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
    """

    model: str = "align_tts"
    # model specific params
    model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
    phase_start_steps: List[int] = None

    ssim_alpha: float = 1.0
    spec_loss_alpha: float = 1.0
    dur_loss_alpha: float = 1.0
    mdn_alpha: float = 1.0

    # multi-speaker settings
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    # FIX: default was `False`, contradicting both the `str` annotation and the
    # documented default. `None` is equally falsy, so callers that truth-test
    # the value behave identically.
    d_vector_file: str = None

    # optimizer parameters
    optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
    lr_scheduler: str = None
    lr_scheduler_params: dict = None
    lr: float = 1e-4
    grad_clip: float = 5.0

    # overrides
    min_seq_len: int = 13
    max_seq_len: int = 200
    r: int = 1

    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "I'm sorry Dave. I'm afraid I can't do that.",
            "This cake is great. It's so delicious and moist.",
            "Prior to November 22, 1963.",
        ]
    )
|
TTS/tts/configs/fast_pitch_config.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
5 |
+
from TTS.tts.models.forward_tts import ForwardTTSArgs
|
6 |
+
|
7 |
+
|
8 |
+
@dataclass
class FastPitchConfig(BaseTTSConfig):
    """Configure `ForwardTTS` as FastPitch model.

    Example:

        >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
        >>> config = FastPitchConfig()

    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.

        base_model (str):
            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.

        model_args (Coqpit):
            Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.

        data_dep_init_steps (int):
            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
            for the rest. Defaults to 10.

        speakers_file (str):
            Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
            speaker names. Defaults to `None`.

        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.

        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.

        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.

        d_vector_dim (int):
            Dimension of the external speaker embeddings. Defaults to 0.

        optimizer (str):
            Name of the model optimizer. Defaults to `Adam`.

        optimizer_params (dict):
            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.

        lr_scheduler (str):
            Name of the learning rate scheduler. Defaults to `NoamLR`.

        lr_scheduler_params (dict):
            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.

        lr (float):
            Initial learning rate. Defaults to `1e-4`.

        grad_clip (float):
            Gradient norm clipping value. Defaults to `5.0`.

        spec_loss_type (str):
            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.

        duration_loss_type (str):
            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.

        use_ssim_loss (bool):
            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.

        ssim_loss_alpha (float):
            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.

        dur_loss_alpha (float):
            Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 0.1.

        spec_loss_alpha (float):
            Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.

        pitch_loss_alpha (float):
            Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.

        binary_align_loss_alpha (float):
            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.

        binary_loss_warmup_epochs (float):
            Number of epochs to gradually increase the binary loss impact. Defaults to 150.

        min_seq_len (int):
            Minimum input sequence length to be used at training.

        max_seq_len (int):
            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
    """

    model: str = "fast_pitch"
    base_model: str = "forward_tts"

    # model specific params
    # FIX: was `ForwardTTSArgs()` — a shared mutable default (all config
    # instances would alias one args object, and Python >= 3.11 rejects it
    # with ValueError). `default_factory` matches AlignTTSConfig's pattern.
    model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)

    # data loader params
    return_wav: bool = False
    compute_linear_spec: bool = False

    # multi-speaker settings
    num_speakers: int = 0
    speakers_file: str = None
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    # FIX: default was `False`, contradicting both the `str` annotation and the
    # documented default. `None` is equally falsy, so `__post_init__`'s
    # truth-test behaves identically.
    d_vector_file: str = None
    d_vector_dim: int = 0

    # optimizer parameters
    optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
    lr_scheduler: str = "NoamLR"
    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
    lr: float = 1e-4
    grad_clip: float = 5.0

    # loss params
    spec_loss_type: str = "mse"
    duration_loss_type: str = "mse"
    use_ssim_loss: bool = True
    ssim_loss_alpha: float = 1.0
    spec_loss_alpha: float = 1.0
    aligner_loss_alpha: float = 1.0
    pitch_loss_alpha: float = 0.1
    dur_loss_alpha: float = 0.1
    binary_align_loss_alpha: float = 0.1
    spk_encoder_loss_alpha: float = 0.1
    binary_loss_warmup_epochs: int = 150
    aligner_epochs: int = 1000

    # overrides
    min_seq_len: int = 13
    max_seq_len: int = 200
    r: int = 1  # DO NOT CHANGE

    # dataset configs
    compute_f0: bool = True
    f0_cache_path: str = None

    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "I'm sorry Dave. I'm afraid I can't do that.",
            "This cake is great. It's so delicious and moist.",
            "Prior to November 22, 1963.",
        ]
    )

    def __post_init__(self):
        """Mirror multi-speaker settings into `model_args`, where
        `model.init_multispeaker()` looks for them."""
        if self.num_speakers > 0:
            self.model_args.num_speakers = self.num_speakers

        # speaker embedding settings
        if self.use_speaker_embedding:
            self.model_args.use_speaker_embedding = True
        if self.speakers_file:
            self.model_args.speakers_file = self.speakers_file

        # d-vector settings
        if self.use_d_vector_file:
            self.model_args.use_d_vector_file = True
        if self.d_vector_dim is not None and self.d_vector_dim > 0:
            self.model_args.d_vector_dim = self.d_vector_dim
        if self.d_vector_file:
            self.model_args.d_vector_file = self.d_vector_file
|