Spaces:
Build error
Build error
Rongjiehuang
committed on
Commit
·
222619b
1
Parent(s):
af80ff8
update
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .gitignore +151 -0
- LICENSE +21 -0
- assets/0011_001570.TextGrid +156 -0
- assets/0011_001570.lab +1 -0
- assets/0011_001570.txt +1 -0
- assets/0011_001570.wav +0 -0
- checkpoints/Emotion_encoder.pt +3 -0
- checkpoints/GenerSpeech/config.yaml +249 -0
- checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt +3 -0
- checkpoints/trainset_hifigan/config.yaml +178 -0
- checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt +3 -0
- data/binary/training_set/mfa_dict.txt +0 -0
- data/binary/training_set/mfa_model.zip +3 -0
- data/binary/training_set/phone_set.json +1 -0
- data/binary/training_set/train_f0s_mean_std.npy +3 -0
- data/binary/training_set/word_set.json +1 -0
- data_gen/tts/base_binarizer.py +224 -0
- data_gen/tts/base_binarizer_emotion.py +352 -0
- data_gen/tts/base_preprocess.py +250 -0
- data_gen/tts/bin/binarize.py +20 -0
- data_gen/tts/bin/pre_align.py +20 -0
- data_gen/tts/bin/train_mfa_align.py +15 -0
- data_gen/tts/data_gen_utils.py +356 -0
- data_gen/tts/emotion/audio.py +107 -0
- data_gen/tts/emotion/inference.py +177 -0
- data_gen/tts/emotion/model.py +78 -0
- data_gen/tts/emotion/params_data.py +29 -0
- data_gen/tts/emotion/params_model.py +11 -0
- data_gen/tts/emotion/test_emotion.py +184 -0
- data_gen/tts/txt_processors/__init__.py +1 -0
- data_gen/tts/txt_processors/base_text_processor.py +47 -0
- data_gen/tts/txt_processors/en.py +77 -0
- data_gen/tts/wav_processors/__init__.py +2 -0
- data_gen/tts/wav_processors/base_processor.py +25 -0
- data_gen/tts/wav_processors/common_processors.py +86 -0
- egs/datasets/audio/emotion/base_text2mel.yaml +17 -0
- egs/datasets/audio/emotion/pre_align.py +25 -0
- egs/datasets/audio/libritts/base_text2mel.yaml +14 -0
- egs/datasets/audio/libritts/fs2.yaml +3 -0
- egs/datasets/audio/libritts/pre_align.py +21 -0
- egs/datasets/audio/libritts/pwg.yaml +8 -0
- egs/datasets/audio/lj/base_mel2wav.yaml +5 -0
- egs/datasets/audio/lj/pre_align.py +13 -0
- egs/datasets/audio/lj/pwg.yaml +3 -0
- egs/datasets/audio/vctk/base_mel2wav.yaml +3 -0
- egs/datasets/audio/vctk/fs2.yaml +12 -0
- egs/datasets/audio/vctk/pre_align.py +22 -0
- egs/datasets/audio/vctk/pwg.yaml +6 -0
- egs/egs_bases/config_base.yaml +46 -0
.gitattributes
CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
31 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
31 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Project ignore
|
2 |
+
|
3 |
+
/ParallelWaveGAN
|
4 |
+
/wavegan_pretrained*
|
5 |
+
/pretrained_models
|
6 |
+
rsync
|
7 |
+
.idea
|
8 |
+
.DS_Store
|
9 |
+
bak
|
10 |
+
tmp
|
11 |
+
*.tar.gz
|
12 |
+
# mfa and kaldi
|
13 |
+
kaldi_align/exp
|
14 |
+
mfa
|
15 |
+
montreal-forced-aligner
|
16 |
+
mos
|
17 |
+
nbs
|
18 |
+
/configs_usr/*
|
19 |
+
!/configs_usr/.gitkeep
|
20 |
+
/fast_transformers
|
21 |
+
/rnnoise
|
22 |
+
/usr/*
|
23 |
+
!/usr/.gitkeep
|
24 |
+
|
25 |
+
# Created by .ignore support plugin (hsz.mobi)
|
26 |
+
### Python template
|
27 |
+
# Byte-compiled / optimized / DLL files
|
28 |
+
__pycache__/
|
29 |
+
*.py[cod]
|
30 |
+
*$py.class
|
31 |
+
|
32 |
+
# C extensions
|
33 |
+
*.so
|
34 |
+
|
35 |
+
# Distribution / packaging
|
36 |
+
.Python
|
37 |
+
build/
|
38 |
+
develop-eggs/
|
39 |
+
dist/
|
40 |
+
downloads/
|
41 |
+
eggs/
|
42 |
+
.eggs/
|
43 |
+
lib/
|
44 |
+
lib64/
|
45 |
+
parts/
|
46 |
+
sdist/
|
47 |
+
var/
|
48 |
+
wheels/
|
49 |
+
pip-wheel-metadata/
|
50 |
+
share/python-wheels/
|
51 |
+
*.egg-info/
|
52 |
+
.installed.cfg
|
53 |
+
*.egg
|
54 |
+
MANIFEST
|
55 |
+
|
56 |
+
# PyInstaller
|
57 |
+
# Usually these files are written by a python script from a template
|
58 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
59 |
+
*.manifest
|
60 |
+
*.spec
|
61 |
+
|
62 |
+
# Installer logs
|
63 |
+
pip-log.txt
|
64 |
+
pip-delete-this-directory.txt
|
65 |
+
|
66 |
+
# Unit test / coverage reports
|
67 |
+
htmlcov/
|
68 |
+
.tox/
|
69 |
+
.nox/
|
70 |
+
.coverage
|
71 |
+
.coverage.*
|
72 |
+
.cache
|
73 |
+
nosetests.xml
|
74 |
+
coverage.xml
|
75 |
+
*.cover
|
76 |
+
.hypothesis/
|
77 |
+
.pytest_cache/
|
78 |
+
|
79 |
+
# Translations
|
80 |
+
*.mo
|
81 |
+
*.pot
|
82 |
+
|
83 |
+
# Django stuff:
|
84 |
+
*.log
|
85 |
+
local_settings.py
|
86 |
+
db.sqlite3
|
87 |
+
db.sqlite3-journal
|
88 |
+
|
89 |
+
# Flask stuff:
|
90 |
+
instance/
|
91 |
+
.webassets-cache
|
92 |
+
|
93 |
+
# Scrapy stuff:
|
94 |
+
.scrapy
|
95 |
+
|
96 |
+
# Sphinx documentation
|
97 |
+
docs/_build/
|
98 |
+
|
99 |
+
# PyBuilder
|
100 |
+
target/
|
101 |
+
|
102 |
+
# Jupyter Notebook
|
103 |
+
.ipynb_checkpoints
|
104 |
+
|
105 |
+
# IPython
|
106 |
+
profile_default/
|
107 |
+
ipython_config.py
|
108 |
+
|
109 |
+
# pyenv
|
110 |
+
.python-version
|
111 |
+
|
112 |
+
# pipenv
|
113 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
114 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
115 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
116 |
+
# install all needed dependencies.
|
117 |
+
#Pipfile.lock
|
118 |
+
|
119 |
+
# celery beat schedule file
|
120 |
+
celerybeat-schedule
|
121 |
+
|
122 |
+
# SageMath parsed files
|
123 |
+
*.sage.py
|
124 |
+
|
125 |
+
# Environments
|
126 |
+
.env
|
127 |
+
.venv
|
128 |
+
env/
|
129 |
+
venv/
|
130 |
+
ENV/
|
131 |
+
env.bak/
|
132 |
+
venv.bak/
|
133 |
+
|
134 |
+
# Spyder project settings
|
135 |
+
.spyderproject
|
136 |
+
.spyproject
|
137 |
+
|
138 |
+
# Rope project settings
|
139 |
+
.ropeproject
|
140 |
+
|
141 |
+
# mkdocs documentation
|
142 |
+
/site
|
143 |
+
|
144 |
+
# mypy
|
145 |
+
.mypy_cache/
|
146 |
+
.dmypy.json
|
147 |
+
dmypy.json
|
148 |
+
|
149 |
+
# Pyre type checker
|
150 |
+
.pyre/
|
151 |
+
将删除 datasets/remi/test/
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2021 Jinglin Liu
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
assets/0011_001570.TextGrid
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
File type = "ooTextFile"
|
2 |
+
Object class = "TextGrid"
|
3 |
+
|
4 |
+
xmin = 0.0
|
5 |
+
xmax = 2.266
|
6 |
+
tiers? <exists>
|
7 |
+
size = 2
|
8 |
+
item []:
|
9 |
+
item [1]:
|
10 |
+
class = "IntervalTier"
|
11 |
+
name = "words"
|
12 |
+
xmin = 0.0
|
13 |
+
xmax = 2.266
|
14 |
+
intervals: size = 10
|
15 |
+
intervals [1]:
|
16 |
+
xmin = 0.000
|
17 |
+
xmax = 0.290
|
18 |
+
text = "sil"
|
19 |
+
intervals [2]:
|
20 |
+
xmin = 0.290
|
21 |
+
xmax = 0.320
|
22 |
+
text = ""
|
23 |
+
intervals [3]:
|
24 |
+
xmin = 0.320
|
25 |
+
xmax = 0.470
|
26 |
+
text = "b_ah1_t"
|
27 |
+
intervals [4]:
|
28 |
+
xmin = 0.470
|
29 |
+
xmax = 0.760
|
30 |
+
text = "ih1_f"
|
31 |
+
intervals [5]:
|
32 |
+
xmin = 0.760
|
33 |
+
xmax = 0.830
|
34 |
+
text = "y_uw1"
|
35 |
+
intervals [6]:
|
36 |
+
xmin = 0.830
|
37 |
+
xmax = 1.370
|
38 |
+
text = "hh_ae1_d_ah0_n_t"
|
39 |
+
intervals [7]:
|
40 |
+
xmin = 1.370
|
41 |
+
xmax = 1.600
|
42 |
+
text = "d_ah1_n"
|
43 |
+
intervals [8]:
|
44 |
+
xmin = 1.600
|
45 |
+
xmax = 1.900
|
46 |
+
text = "dh_eh1_m"
|
47 |
+
intervals [9]:
|
48 |
+
xmin = 1.900
|
49 |
+
xmax = 1.930
|
50 |
+
text = "sil"
|
51 |
+
intervals [10]:
|
52 |
+
xmin = 1.930
|
53 |
+
xmax = 2.266
|
54 |
+
text = ""
|
55 |
+
item [2]:
|
56 |
+
class = "IntervalTier"
|
57 |
+
name = "phones"
|
58 |
+
xmin = 0.0
|
59 |
+
xmax = 2.266
|
60 |
+
intervals: size = 24
|
61 |
+
intervals [1]:
|
62 |
+
xmin = 0.000
|
63 |
+
xmax = 0.290
|
64 |
+
text = "SIL"
|
65 |
+
intervals [2]:
|
66 |
+
xmin = 0.290
|
67 |
+
xmax = 0.320
|
68 |
+
text = "sp"
|
69 |
+
intervals [3]:
|
70 |
+
xmin = 0.320
|
71 |
+
xmax = 0.400
|
72 |
+
text = "B"
|
73 |
+
intervals [4]:
|
74 |
+
xmin = 0.400
|
75 |
+
xmax = 0.440
|
76 |
+
text = "AH1"
|
77 |
+
intervals [5]:
|
78 |
+
xmin = 0.440
|
79 |
+
xmax = 0.470
|
80 |
+
text = "T"
|
81 |
+
intervals [6]:
|
82 |
+
xmin = 0.470
|
83 |
+
xmax = 0.530
|
84 |
+
text = "IH1"
|
85 |
+
intervals [7]:
|
86 |
+
xmin = 0.530
|
87 |
+
xmax = 0.760
|
88 |
+
text = "F"
|
89 |
+
intervals [8]:
|
90 |
+
xmin = 0.760
|
91 |
+
xmax = 0.800
|
92 |
+
text = "Y"
|
93 |
+
intervals [9]:
|
94 |
+
xmin = 0.800
|
95 |
+
xmax = 0.830
|
96 |
+
text = "UW1"
|
97 |
+
intervals [10]:
|
98 |
+
xmin = 0.830
|
99 |
+
xmax = 0.980
|
100 |
+
text = "HH"
|
101 |
+
intervals [11]:
|
102 |
+
xmin = 0.980
|
103 |
+
xmax = 1.180
|
104 |
+
text = "AE1"
|
105 |
+
intervals [12]:
|
106 |
+
xmin = 1.180
|
107 |
+
xmax = 1.220
|
108 |
+
text = "D"
|
109 |
+
intervals [13]:
|
110 |
+
xmin = 1.220
|
111 |
+
xmax = 1.250
|
112 |
+
text = "AH0"
|
113 |
+
intervals [14]:
|
114 |
+
xmin = 1.250
|
115 |
+
xmax = 1.340
|
116 |
+
text = "N"
|
117 |
+
intervals [15]:
|
118 |
+
xmin = 1.340
|
119 |
+
xmax = 1.370
|
120 |
+
text = "T"
|
121 |
+
intervals [16]:
|
122 |
+
xmin = 1.370
|
123 |
+
xmax = 1.410
|
124 |
+
text = "D"
|
125 |
+
intervals [17]:
|
126 |
+
xmin = 1.410
|
127 |
+
xmax = 1.550
|
128 |
+
text = "AH1"
|
129 |
+
intervals [18]:
|
130 |
+
xmin = 1.550
|
131 |
+
xmax = 1.600
|
132 |
+
text = "N"
|
133 |
+
intervals [19]:
|
134 |
+
xmin = 1.600
|
135 |
+
xmax = 1.650
|
136 |
+
text = "DH"
|
137 |
+
intervals [20]:
|
138 |
+
xmin = 1.650
|
139 |
+
xmax = 1.800
|
140 |
+
text = "EH1"
|
141 |
+
intervals [21]:
|
142 |
+
xmin = 1.800
|
143 |
+
xmax = 1.900
|
144 |
+
text = "M"
|
145 |
+
intervals [22]:
|
146 |
+
xmin = 1.900
|
147 |
+
xmax = 1.930
|
148 |
+
text = "SIL"
|
149 |
+
intervals [23]:
|
150 |
+
xmin = 1.930
|
151 |
+
xmax = 2.250
|
152 |
+
text = "sp"
|
153 |
+
intervals [24]:
|
154 |
+
xmin = 2.250
|
155 |
+
xmax = 2.266
|
156 |
+
text = ""
|
assets/0011_001570.lab
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
SIL B_AH1_T IH1_F Y_UW1 HH_AE1_D_AH0_N_T D_AH1_N DH_EH1_M SIL
|
assets/0011_001570.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
But if you hadn't done them.
|
assets/0011_001570.wav
ADDED
Binary file (72.6 kB). View file
|
|
checkpoints/Emotion_encoder.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5
|
3 |
+
size 17095350
|
checkpoints/GenerSpeech/config.yaml
ADDED
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
amp: false
|
3 |
+
audio_num_mel_bins: 80
|
4 |
+
audio_sample_rate: 16000
|
5 |
+
base_config:
|
6 |
+
- egs/egs_bases/tts/fs2_adv.yaml
|
7 |
+
- egs/datasets/audio/emotion/base_text2mel.yaml
|
8 |
+
binarization_args:
|
9 |
+
reset_phone_dict: true
|
10 |
+
reset_word_dict: true
|
11 |
+
shuffle: true
|
12 |
+
trim_eos_bos: false
|
13 |
+
trim_sil: false
|
14 |
+
with_align: true
|
15 |
+
with_f0: true
|
16 |
+
with_f0cwt: false
|
17 |
+
with_linear: false
|
18 |
+
with_spk_embed: true
|
19 |
+
with_spk_id: true
|
20 |
+
with_txt: true
|
21 |
+
with_wav: true
|
22 |
+
with_word: true
|
23 |
+
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
|
24 |
+
binary_data_dir: data/binary/training_set
|
25 |
+
check_val_every_n_epoch: 10
|
26 |
+
clip_grad_norm: 1
|
27 |
+
clip_grad_value: 0
|
28 |
+
conv_use_pos: false
|
29 |
+
crop: false
|
30 |
+
cwt_add_f0_loss: false
|
31 |
+
cwt_hidden_size: 128
|
32 |
+
cwt_layers: 2
|
33 |
+
cwt_loss: l1
|
34 |
+
cwt_std_scale: 0.8
|
35 |
+
debug: false
|
36 |
+
dec_dilations:
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
- 1
|
40 |
+
- 1
|
41 |
+
dec_ffn_kernel_size: 9
|
42 |
+
dec_inp_add_noise: false
|
43 |
+
dec_kernel_size: 5
|
44 |
+
dec_layers: 4
|
45 |
+
dec_num_heads: 2
|
46 |
+
decoder_rnn_dim: 0
|
47 |
+
decoder_type: fft
|
48 |
+
dict_dir: ''
|
49 |
+
disc_hidden_size: 128
|
50 |
+
disc_interval: 1
|
51 |
+
disc_lr: 0.0001
|
52 |
+
disc_norm: in
|
53 |
+
disc_reduction: stack
|
54 |
+
disc_start_steps: 0
|
55 |
+
disc_win_num: 3
|
56 |
+
discriminator_grad_norm: 1
|
57 |
+
discriminator_optimizer_params:
|
58 |
+
eps: 1.0e-06
|
59 |
+
weight_decay: 0.0
|
60 |
+
discriminator_scheduler_params:
|
61 |
+
gamma: 0.5
|
62 |
+
step_size: 60000
|
63 |
+
dropout: 0.05
|
64 |
+
ds_workers: 2
|
65 |
+
dur_enc_hidden_stride_kernel:
|
66 |
+
- 0,2,3
|
67 |
+
- 0,2,3
|
68 |
+
- 0,1,3
|
69 |
+
dur_loss: mse
|
70 |
+
dur_predictor_kernel: 3
|
71 |
+
dur_predictor_layers: 2
|
72 |
+
emotion_encoder_path: /home1/huangrongjie/Project/Emotion_encoder/1121_emotion_encoder.pt
|
73 |
+
enc_dec_norm: ln
|
74 |
+
enc_dilations:
|
75 |
+
- 1
|
76 |
+
- 1
|
77 |
+
- 1
|
78 |
+
- 1
|
79 |
+
enc_ffn_kernel_size: 9
|
80 |
+
enc_kernel_size: 5
|
81 |
+
enc_layers: 4
|
82 |
+
encoder_K: 8
|
83 |
+
encoder_type: fft
|
84 |
+
endless_ds: true
|
85 |
+
ffn_act: gelu
|
86 |
+
ffn_hidden_size: 1024
|
87 |
+
ffn_padding: SAME
|
88 |
+
fft_size: 1024
|
89 |
+
fmax: 7600
|
90 |
+
fmin: 80
|
91 |
+
forcing: 20000
|
92 |
+
frames_multiple: 1
|
93 |
+
gen_dir_name: ''
|
94 |
+
generator_grad_norm: 5.0
|
95 |
+
griffin_lim_iters: 60
|
96 |
+
hidden_size: 256
|
97 |
+
hop_size: 256
|
98 |
+
infer: false
|
99 |
+
lambda_commit: 0.25
|
100 |
+
lambda_energy: 0.1
|
101 |
+
lambda_f0: 1.0
|
102 |
+
lambda_mel_adv: 0.1
|
103 |
+
lambda_ph_dur: 0.1
|
104 |
+
lambda_sent_dur: 1.0
|
105 |
+
lambda_uv: 1.0
|
106 |
+
lambda_word_dur: 1.0
|
107 |
+
layers_in_block: 2
|
108 |
+
load_ckpt: ''
|
109 |
+
loud_norm: false
|
110 |
+
lr: 1.0
|
111 |
+
max_epochs: 1000
|
112 |
+
max_frames: 1548
|
113 |
+
max_input_tokens: 1550
|
114 |
+
max_sentences: 100000
|
115 |
+
max_tokens: 30000
|
116 |
+
max_updates: 300000
|
117 |
+
max_valid_sentences: 1
|
118 |
+
max_valid_tokens: 60000
|
119 |
+
mel_disc_hidden_size: 128
|
120 |
+
mel_gan: true
|
121 |
+
mel_hidden_size: 256
|
122 |
+
mel_loss: ssim:0.5|l1:0.5
|
123 |
+
mel_vmax: 1.5
|
124 |
+
mel_vmin: -6
|
125 |
+
min_frames: 128
|
126 |
+
min_level_db: -100
|
127 |
+
nVQ: 128
|
128 |
+
noise_scale: 0.8
|
129 |
+
num_ckpt_keep: 2
|
130 |
+
num_heads: 2
|
131 |
+
num_sanity_val_steps: -1
|
132 |
+
num_spk: 500
|
133 |
+
num_test_samples: 72
|
134 |
+
num_valid_plots: 10
|
135 |
+
optimizer_adam_beta1: 0.5
|
136 |
+
optimizer_adam_beta2: 0.999
|
137 |
+
out_wav_norm: false
|
138 |
+
pitch_ar: false
|
139 |
+
pitch_embed_type: 0
|
140 |
+
pitch_enc_hidden_stride_kernel:
|
141 |
+
- 0,2,5
|
142 |
+
- 0,2,5
|
143 |
+
- 0,2,5
|
144 |
+
pitch_extractor: parselmouth
|
145 |
+
pitch_loss: l1
|
146 |
+
pitch_norm: standard
|
147 |
+
pitch_ssim_win: 11
|
148 |
+
pitch_type: frame
|
149 |
+
post_glow_hidden: 128
|
150 |
+
post_glow_kernel_size: 3
|
151 |
+
post_glow_n_block_layers: 3
|
152 |
+
post_glow_n_blocks: 8
|
153 |
+
post_share_cond_layers: false
|
154 |
+
pre_align_args:
|
155 |
+
allow_no_txt: false
|
156 |
+
denoise: false
|
157 |
+
sox_resample: false
|
158 |
+
sox_to_wav: false
|
159 |
+
trim_sil: false
|
160 |
+
txt_processor: en
|
161 |
+
use_tone: true
|
162 |
+
pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
|
163 |
+
predictor_dropout: 0.5
|
164 |
+
predictor_grad: 1.0
|
165 |
+
predictor_hidden: -1
|
166 |
+
predictor_kernel: 5
|
167 |
+
predictor_layers: 2
|
168 |
+
preprocess_args:
|
169 |
+
add_eos_bos: true
|
170 |
+
mfa_group_shuffle: false
|
171 |
+
mfa_offset: 0.02
|
172 |
+
nsample_per_mfa_group: 1000
|
173 |
+
reset_phone_dict: true
|
174 |
+
reset_word_dict: true
|
175 |
+
save_sil_mask: true
|
176 |
+
txt_processor: en
|
177 |
+
use_mfa: true
|
178 |
+
vad_max_silence_length: 12
|
179 |
+
wav_processors: []
|
180 |
+
with_phsep: true
|
181 |
+
preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
|
182 |
+
pretrain_fs_ckpt: ''
|
183 |
+
print_nan_grads: false
|
184 |
+
processed_data_dir: data/processed/emotion
|
185 |
+
profile_infer: false
|
186 |
+
raw_data_dir: data/raw/ESD
|
187 |
+
ref_audio: ''
|
188 |
+
ref_hidden_stride_kernel:
|
189 |
+
- 0,3,5
|
190 |
+
- 0,3,5
|
191 |
+
- 0,2,5
|
192 |
+
- 0,2,5
|
193 |
+
- 0,2,5
|
194 |
+
ref_level_db: 20
|
195 |
+
ref_norm_layer: bn
|
196 |
+
rename_tmux: true
|
197 |
+
rerun_gen: false
|
198 |
+
resume_from_checkpoint: 0
|
199 |
+
save_best: false
|
200 |
+
save_codes: []
|
201 |
+
save_f0: false
|
202 |
+
save_gt: true
|
203 |
+
scheduler: rsqrt
|
204 |
+
seed: 1234
|
205 |
+
share_wn_layers: 4
|
206 |
+
sigmoid_scale: false
|
207 |
+
sil_add_noise: false
|
208 |
+
sort_by_len: true
|
209 |
+
task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
|
210 |
+
tb_log_interval: 100
|
211 |
+
test_ids: []
|
212 |
+
test_input_dir: ''
|
213 |
+
test_num: 200
|
214 |
+
test_set_name: test
|
215 |
+
text: ''
|
216 |
+
train_set_name: train
|
217 |
+
train_sets: ''
|
218 |
+
use_cond_disc: false
|
219 |
+
use_emotion: true
|
220 |
+
use_energy_embed: false
|
221 |
+
use_gt_dur: false
|
222 |
+
use_gt_f0: false
|
223 |
+
use_latent_cond: true
|
224 |
+
use_pitch_embed: true
|
225 |
+
use_pos_embed: true
|
226 |
+
use_ref_enc: false
|
227 |
+
use_spk_embed: true
|
228 |
+
use_spk_id: false
|
229 |
+
use_split_spk_id: false
|
230 |
+
use_txt_cond: true
|
231 |
+
use_uv: true
|
232 |
+
use_var_enc: false
|
233 |
+
use_word: true
|
234 |
+
vae_dropout: 0.0
|
235 |
+
val_check_interval: 2000
|
236 |
+
valid_infer_interval: 10000
|
237 |
+
valid_monitor_key: val_loss
|
238 |
+
valid_monitor_mode: min
|
239 |
+
valid_set_name: valid
|
240 |
+
var_enc_vq_codes: 64
|
241 |
+
vocoder: hifigan
|
242 |
+
vocoder_ckpt: checkpoints/trainset_hifigan
|
243 |
+
vocoder_denoise_c: 0.0
|
244 |
+
vq_start: 20500
|
245 |
+
warmup_updates: 2000
|
246 |
+
weight_decay: 0
|
247 |
+
win_size: 1024
|
248 |
+
word_size: 30000
|
249 |
+
work_dir: checkpoints/GenerSpeech_release4
|
checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea
|
3 |
+
size 619582860
|
checkpoints/trainset_hifigan/config.yaml
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
adam_b1: 0.8
|
3 |
+
adam_b2: 0.99
|
4 |
+
amp: false
|
5 |
+
audio_num_mel_bins: 80
|
6 |
+
audio_sample_rate: 16000
|
7 |
+
aux_context_window: 0
|
8 |
+
base_config:
|
9 |
+
- egs/egs_bases/tts/vocoder/hifigan.yaml
|
10 |
+
- egs/datasets/audio/emotion/base_text2mel.yaml
|
11 |
+
binarization_args:
|
12 |
+
reset_phone_dict: true
|
13 |
+
reset_word_dict: true
|
14 |
+
shuffle: true
|
15 |
+
trim_eos_bos: false
|
16 |
+
trim_sil: false
|
17 |
+
with_align: false
|
18 |
+
with_f0: true
|
19 |
+
with_f0cwt: false
|
20 |
+
with_linear: false
|
21 |
+
with_spk_embed: false
|
22 |
+
with_spk_id: true
|
23 |
+
with_txt: false
|
24 |
+
with_wav: true
|
25 |
+
with_word: false
|
26 |
+
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
|
27 |
+
binary_data_dir: data/binary/training_set
|
28 |
+
check_val_every_n_epoch: 10
|
29 |
+
clip_grad_norm: 1
|
30 |
+
clip_grad_value: 0
|
31 |
+
debug: false
|
32 |
+
dec_ffn_kernel_size: 9
|
33 |
+
dec_layers: 4
|
34 |
+
dict_dir: ''
|
35 |
+
disc_start_steps: 40000
|
36 |
+
discriminator_grad_norm: 1
|
37 |
+
discriminator_optimizer_params:
|
38 |
+
lr: 0.0002
|
39 |
+
discriminator_scheduler_params:
|
40 |
+
gamma: 0.999
|
41 |
+
step_size: 600
|
42 |
+
dropout: 0.1
|
43 |
+
ds_workers: 1
|
44 |
+
enc_ffn_kernel_size: 9
|
45 |
+
enc_layers: 4
|
46 |
+
endless_ds: true
|
47 |
+
ffn_act: gelu
|
48 |
+
ffn_padding: SAME
|
49 |
+
fft_size: 1024
|
50 |
+
fmax: 7600
|
51 |
+
fmin: 80
|
52 |
+
frames_multiple: 1
|
53 |
+
gen_dir_name: ''
|
54 |
+
generator_grad_norm: 10
|
55 |
+
generator_optimizer_params:
|
56 |
+
lr: 0.0002
|
57 |
+
generator_scheduler_params:
|
58 |
+
gamma: 0.999
|
59 |
+
step_size: 600
|
60 |
+
griffin_lim_iters: 60
|
61 |
+
hidden_size: 256
|
62 |
+
hop_size: 256
|
63 |
+
infer: false
|
64 |
+
lambda_adv: 1.0
|
65 |
+
lambda_cdisc: 4.0
|
66 |
+
lambda_mel: 5.0
|
67 |
+
lambda_mel_adv: 1.0
|
68 |
+
load_ckpt: ''
|
69 |
+
loud_norm: false
|
70 |
+
lr: 2.0
|
71 |
+
max_epochs: 1000
|
72 |
+
max_frames: 1548
|
73 |
+
max_input_tokens: 1550
|
74 |
+
max_samples: 8192
|
75 |
+
max_sentences: 24
|
76 |
+
max_tokens: 30000
|
77 |
+
max_updates: 1000000
|
78 |
+
max_valid_sentences: 1
|
79 |
+
max_valid_tokens: 60000
|
80 |
+
mel_loss: ssim:0.5|l1:0.5
|
81 |
+
mel_vmax: 1.5
|
82 |
+
mel_vmin: -6
|
83 |
+
min_frames: 128
|
84 |
+
min_level_db: -100
|
85 |
+
num_ckpt_keep: 3
|
86 |
+
num_heads: 2
|
87 |
+
num_mels: 80
|
88 |
+
num_sanity_val_steps: -1
|
89 |
+
num_spk: 10
|
90 |
+
num_test_samples: 30
|
91 |
+
num_valid_plots: 10
|
92 |
+
optimizer_adam_beta1: 0.9
|
93 |
+
optimizer_adam_beta2: 0.98
|
94 |
+
out_wav_norm: false
|
95 |
+
pitch_extractor: parselmouth
|
96 |
+
pitch_type: frame
|
97 |
+
pre_align_args:
|
98 |
+
allow_no_txt: false
|
99 |
+
denoise: false
|
100 |
+
sox_resample: false
|
101 |
+
sox_to_wav: false
|
102 |
+
trim_sil: false
|
103 |
+
txt_processor: en
|
104 |
+
use_tone: true
|
105 |
+
pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
|
106 |
+
print_nan_grads: false
|
107 |
+
processed_data_dir: data/processed/emotion,data/processed/LibriTTS
|
108 |
+
profile_infer: false
|
109 |
+
raw_data_dir: data/raw/ESD
|
110 |
+
ref_level_db: 20
|
111 |
+
rename_tmux: true
|
112 |
+
resblock: '1'
|
113 |
+
resblock_dilation_sizes:
|
114 |
+
- - 1
|
115 |
+
- 3
|
116 |
+
- 5
|
117 |
+
- - 1
|
118 |
+
- 3
|
119 |
+
- 5
|
120 |
+
- - 1
|
121 |
+
- 3
|
122 |
+
- 5
|
123 |
+
resblock_kernel_sizes:
|
124 |
+
- 3
|
125 |
+
- 7
|
126 |
+
- 11
|
127 |
+
resume_from_checkpoint: 0
|
128 |
+
save_best: true
|
129 |
+
save_codes: []
|
130 |
+
save_f0: false
|
131 |
+
save_gt: true
|
132 |
+
scheduler: rsqrt
|
133 |
+
seed: 1234
|
134 |
+
sort_by_len: true
|
135 |
+
task_cls: tasks.vocoder.hifigan.HifiGanTask
|
136 |
+
tb_log_interval: 100
|
137 |
+
test_ids: []
|
138 |
+
test_input_dir: ''
|
139 |
+
test_num: 200
|
140 |
+
test_set_name: test
|
141 |
+
train_set_name: train
|
142 |
+
train_sets: ''
|
143 |
+
upsample_initial_channel: 512
|
144 |
+
upsample_kernel_sizes:
|
145 |
+
- 16
|
146 |
+
- 16
|
147 |
+
- 4
|
148 |
+
- 4
|
149 |
+
upsample_rates:
|
150 |
+
- 8
|
151 |
+
- 8
|
152 |
+
- 2
|
153 |
+
- 2
|
154 |
+
use_cdisc: false
|
155 |
+
use_cond_disc: false
|
156 |
+
use_emotion: true
|
157 |
+
use_fm_loss: false
|
158 |
+
use_ms_stft: false
|
159 |
+
use_pitch_embed: false
|
160 |
+
use_spec_disc: false
|
161 |
+
use_spk_embed: false
|
162 |
+
use_spk_id: true
|
163 |
+
use_split_spk_id: false
|
164 |
+
val_check_interval: 2000
|
165 |
+
valid_infer_interval: 10000
|
166 |
+
valid_monitor_key: val_loss
|
167 |
+
valid_monitor_mode: min
|
168 |
+
valid_set_name: valid
|
169 |
+
vocoder: pwg
|
170 |
+
vocoder_ckpt: ''
|
171 |
+
vocoder_denoise_c: 0.0
|
172 |
+
warmup_updates: 8000
|
173 |
+
weight_decay: 0
|
174 |
+
win_length: null
|
175 |
+
win_size: 1024
|
176 |
+
window: hann
|
177 |
+
word_size: 30000
|
178 |
+
work_dir: checkpoints/trainset_hifigan
|
checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870
|
3 |
+
size 1016199247
|
data/binary/training_set/mfa_dict.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/binary/training_set/mfa_model.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:71dc26b9aba3529892eebc21088db2b8eee41c89d87085c24148cf96b029a62c
|
3 |
+
size 23850075
|
data/binary/training_set/phone_set.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
|
data/binary/training_set/train_f0s_mean_std.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8489ff2f4fd60c6a445b35f0a5a6629923880abebe11ff6ead6c2ebd4bfe28f5
|
3 |
+
size 144
|
data/binary/training_set/word_set.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[".", "the", ",", "a", "and", "of", "her", "these", "with", "is", "its", "we", "to", "at", "things", "for", "rainbow", "into", "as", "end", "will", "she", "can", "please", "call", "stella", "take", "shape", "long", "round", "arch", "path", "high", "above", "two", "ends", "apparently", "beyond", "horizon", "six", "spoons", "fresh", "snow", "peas", "five", "thick", "slabs", "blue", "cheese", "maybe", "snack", "brother", "bob", "i", "there", "according", "legend", "boiling", "pot", "gold", "one", "when", "sunlight", "strikes", "raindrops", "in", "air", "they", "act", "prism", "form", "ask", "bring", "from", "store", "scoop", "three", "red", "bags", "go", "meet", "wednesday", "train", "station", "also", "need", "small", "plastic", "snake", "big", "toy", "frog", "kids", "division", "white", "light", "many", "beautiful", "colors", "you", "your", "say", "he", "have", "be", "just", "know", "because", "was", "man", "infinite", "resource", "sagacity", "shouldnt", "pricked", "him", "horn", "all", "this", "won", "by", "our", "labour", "neither", "yea", "nor", "nay", "but", "if", "hadnt", "done", "them", "emperor", "no", "admittance", "except", "on", "party", "business", "smiled", "calmly", "mother", "knows", "that", "best", "smile", "id", "soon", "swim", "way", "others", "do", "searched", "through", "box", "name", "more", "hilarious", "?", "words", "behind", "ears", "nonsense", "tom", "fell", "cloven", "head", "vowed", "hed", "change", "pigtails", "place", "shall", "good", "bye", "part", "fish", "mouth", "chew", "leaves", "quickly", "said", "rabbit", "pay", "half", "crown", "week", "extra", "daisy", "creams", "pink", "edges"]
|
data_gen/tts/base_binarizer.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
3 |
+
|
4 |
+
from utils.multiprocess_utils import chunked_multiprocess_run
|
5 |
+
import random
|
6 |
+
import traceback
|
7 |
+
import json
|
8 |
+
from resemblyzer import VoiceEncoder
|
9 |
+
from tqdm import tqdm
|
10 |
+
from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
|
11 |
+
from utils.hparams import set_hparams, hparams
|
12 |
+
import numpy as np
|
13 |
+
from utils.indexed_datasets import IndexedDatasetBuilder
|
14 |
+
from vocoders.base_vocoder import VOCODERS
|
15 |
+
import pandas as pd
|
16 |
+
|
17 |
+
|
18 |
+
class BinarizationError(Exception):
    """Signal that one item cannot be binarized and has to be skipped."""
|
20 |
+
|
21 |
+
|
22 |
+
class BaseBinarizer:
    """Turn a pre-processed TTS corpus into indexed binary datasets.

    Item metadata is read from ``metadata_phone.csv`` in each processed data
    directory.  ``process`` then writes the ``valid``, ``test`` and ``train``
    splits, plus per-split length and f0 statistics, into
    ``hparams['binary_data_dir']``.
    """

    def __init__(self, processed_data_dir=None):
        """Load the item metadata of every processed data directory.

        :param processed_data_dir: comma-separated directory list; falls back
            to ``hparams['processed_data_dir']`` when ``None``.
        """
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.forced_align = self.pre_align_args['forced_align']
        # Sub-directory with the TextGrid files; stays None for any other
        # aligner, in which case no TextGrid path is recorded at all.
        tg_dir = None
        if self.forced_align == 'mfa':
            tg_dir = 'mfa_outputs'
        if self.forced_align == 'kaldi':
            tg_dir = 'kaldi_outputs'
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2tgfn = {}
        self.item2spk = {}
        # With several datasets, item and speaker names get a ``ds{id}_``
        # prefix so that equal names from different datasets do not collide.
        multi_dataset = len(self.processed_data_dirs) > 1
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
            for _, r in self.meta_df.iterrows():
                item_name = raw_item_name = r['item_name']
                if multi_dataset:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = r['txt']
                self.item2ph[item_name] = r['ph']
                # NOTE(review): the wav is looked up under raw_data_dir/wavs by
                # the second '_'-separated piece of the metadata basename; this
                # presumably matches one specific dataset layout - confirm.
                self.item2wavfn[item_name] = os.path.join(
                    hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
                spk_name = r.get('spk', 'SPK1')
                if multi_dataset:
                    spk_name = f"ds{ds_id}_{spk_name}"
                self.item2spk[item_name] = spk_name
                if tg_dir is not None:
                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
        self.item_names = sorted(self.item2txt.keys())
        if self.binarization_args['shuffle']:
            # Fixed seed keeps the split assignment reproducible across runs.
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        """Items after the first ``test_num + valid_num`` entries."""
        return self.item_names[hparams['test_num'] + hparams['valid_num']:]

    @property
    def valid_item_names(self):
        """The first ``test_num + valid_num`` items (includes the test items)."""
        return self.item_names[0: hparams['test_num'] + hparams['valid_num']]

    @property
    def test_item_names(self):
        """The first ``test_num`` items; audios for MOS testing are in 'test_ids'."""
        return self.item_names[0: hparams['test_num']]

    def build_spk_map(self):
        """Return ``{speaker name: index}`` with indices in sorted-name order.

        :raises AssertionError: if there are more speakers than ``hparams['num_spk']``.
        """
        spk_names = {self.item2spk[item_name] for item_name in self.item_names}
        spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        return spk_map

    def item_name2spk_id(self, item_name):
        """Map an item name to its integer speaker id (needs ``self.spk_map``)."""
        return self.spk_map[self.item2spk[item_name]]

    def _phone_encoder(self):
        """Create or load ``phone_set.json`` and return the phone encoder.

        The set is rebuilt from the first space-separated column of every
        ``dict.txt`` when ``hparams['reset_phone_dict']`` is set or the JSON
        file does not exist yet.
        """
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = []
            for processed_data_dir in self.processed_data_dirs:
                with open(f'{processed_data_dir}/dict.txt') as dict_file:
                    ph_set += [line.split(' ')[0] for line in dict_file]
            ph_set = sorted(set(ph_set))
            # Close (and thereby flush) the file before build_phone_encoder is
            # handed the same directory below.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
        print("| phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    def meta_data(self, prefix):
        """Yield ``(item_name, ph, txt, tg_fn, wav_fn, spk_id)`` for one split.

        :param prefix: ``'valid'`` or ``'test'``; anything else selects train.
        """
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            ph = self.item2ph[item_name]
            txt = self.item2txt[item_name]
            # None when no TextGrid path was recorded for this item.
            tg_fn = self.item2tgfn.get(item_name)
            wav_fn = self.item2wavfn[item_name]
            spk_id = self.item_name2spk_id(item_name)
            yield item_name, ph, txt, tg_fn, wav_fn, spk_id

    def process(self):
        """Write the speaker map and binarize the valid, test and train splits."""
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        with open(spk_map_fn, 'w') as f:
            json.dump(self.spk_map, f)

        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    @staticmethod
    def _default_num_workers():
        """Return the worker count: ``$N_PROC`` if set, else a third of the CPUs.

        The fallback is clamped to at least one worker, because
        ``os.cpu_count()`` may return ``None`` and ``cpu_count // 3`` is 0 on
        machines with fewer than three cores.
        """
        n_proc = os.getenv('N_PROC')
        if n_proc is not None:
            return int(n_proc)
        return max(1, (os.cpu_count() or 1) // 3)

    def process_data(self, prefix):
        """Binarize one split and store its dataset, lengths and f0 statistics.

        :param prefix: split name (``'valid'``, ``'test'`` or ``'train'``), also
            used as the file-name prefix inside ``hparams['binary_data_dir']``.
        """
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        lengths = []
        f0s = []
        total_sec = 0
        with_spk_embed = self.binarization_args['with_spk_embed']
        voice_encoder = VoiceEncoder().cuda() if with_spk_embed else None

        meta_data = list(self.meta_data(prefix))
        args = [list(m) + [self.phone_encoder, self.binarization_args] for m in meta_data]
        num_workers = self._default_num_workers()
        results = chunked_multiprocess_run(self.process_item, args, num_workers=num_workers)
        # tqdm(meta_data) is zipped in only to drive the progress bar.
        for _, item in zip(tqdm(meta_data), results):
            if item is None:  # process_item skipped this item
                continue
            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) if with_spk_embed else None
            if not self.binarization_args['with_wav'] and 'wav' in item:
                print("del wav")
                del item['wav']
            builder.add_item(item)
            lengths.append(item['len'])
            total_sec += item['sec']
            if item.get('f0') is not None:
                f0s.append(item['f0'])
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
        if len(f0s) > 0:
            f0s = np.concatenate(f0s, 0)
            # Zero entries are excluded from the statistics.
            f0s = f0s[f0s != 0]
            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        """Extract all features of one item.

        :param encoder: phone encoder used when ``binarization_args['with_txt']`` is set.
        :param binarization_args: dict of feature switches (``with_f0``,
            ``with_f0cwt``, ``with_txt``, ``with_align``).
        :return: the feature dict, or ``None`` when a ``BinarizationError``
            marks the item as unusable.
        """
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            # Fall back to the last dotted component of the configured name.
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav, mel, res)
                if binarization_args['with_f0cwt']:
                    cls.get_f0cwt(res['f0'], res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except Exception:
                    # Any encoding failure is reported and turns into a skip.
                    traceback.print_exc()
                    raise BinarizationError("Empty phoneme")
                if binarization_args['with_align']:
                    cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res

    @staticmethod
    def get_align(tg_fn, ph, mel, phone_encoded, res):
        """Store ``mel2ph`` and ``dur`` from the item's TextGrid in ``res``.

        :raises BinarizationError: if the TextGrid is missing or refers to more
            phones than ``phone_encoded`` contains.
        """
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError("Align not found")
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur

    @staticmethod
    def get_pitch(wav, mel, res):
        """Store ``f0`` and the coarse ``pitch`` in ``res``.

        :raises BinarizationError: if the extracted f0 sums to zero.
        """
        # The module-level get_pitch helper, not this method.
        f0, pitch_coarse = get_pitch(wav, mel, hparams)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        res['f0'] = f0
        res['pitch'] = pitch_coarse

    @staticmethod
    def get_f0cwt(f0, res):
        """Store the wavelet transform of the normalised log-f0 in ``res``.

        :raises BinarizationError: if the transform contains NaN values.
        """
        from utils.cwt import get_cont_lf0, get_lf0_cwt
        uv, cont_lf0_lpf = get_cont_lf0(f0)
        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
        if np.any(np.isnan(Wavelet_lf0)):
            raise BinarizationError("NaN CWT")
        res['cwt_spec'] = Wavelet_lf0
        res['cwt_scales'] = scales
        res['f0_mean'] = logf0s_mean_org
        res['f0_std'] = logf0s_std_org
|
220 |
+
|
221 |
+
|
222 |
+
if __name__ == "__main__":
    # set_hparams() runs first because BaseBinarizer.__init__ reads the
    # module-level hparams.
    set_hparams()
    binarizer = BaseBinarizer()
    binarizer.process()
|
data_gen/tts/base_binarizer_emotion.py
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
4 |
+
import torch
|
5 |
+
from collections import Counter
|
6 |
+
from utils.text_encoder import TokenTextEncoder
|
7 |
+
from data_gen.tts.emotion import inference as EmotionEncoder
|
8 |
+
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
|
9 |
+
from data_gen.tts.emotion.inference import preprocess_wav
|
10 |
+
from utils.multiprocess_utils import chunked_multiprocess_run
|
11 |
+
import random
|
12 |
+
import traceback
|
13 |
+
import json
|
14 |
+
from resemblyzer import VoiceEncoder
|
15 |
+
from tqdm import tqdm
|
16 |
+
from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
|
17 |
+
from utils.hparams import hparams, set_hparams
|
18 |
+
import numpy as np
|
19 |
+
from utils.indexed_datasets import IndexedDatasetBuilder
|
20 |
+
from vocoders.base_vocoder import get_vocoder_cls
|
21 |
+
import pandas as pd
|
22 |
+
|
23 |
+
|
24 |
+
class BinarizationError(Exception):
    """Signal that one item cannot be binarized and has to be skipped."""
|
26 |
+
|
27 |
+
|
28 |
+
class EmotionBinarizer:
    """Binarize a pre-processed TTS corpus with speaker and emotion labels.

    ``process`` loads ``metadata_phone.csv`` from each processed data
    directory and writes the ``valid``, ``test`` and ``train`` splits, with
    their speaker/emotion maps and statistics, into
    ``hparams['binary_data_dir']``.
    """

    def __init__(self, processed_data_dir=None):
        """Record the configuration; metadata is loaded later by ``load_meta_data``.

        :param processed_data_dir: comma-separated directory list; falls back
            to ``hparams['processed_data_dir']`` when ``None``.
        """
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2tgfn = {}
        self.item2spk = {}
        self.item2emo = {}

    def load_meta_data(self):
        """Fill the ``item2*`` tables and build the (optionally shuffled) item list."""
        # With several datasets, item and speaker names get a ``ds{id}_``
        # prefix so that equal names from different datasets do not collide.
        multi_dataset = len(self.processed_data_dirs) > 1
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
            for _, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
                item_name = raw_item_name = r['item_name']
                if multi_dataset:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = r['txt']
                self.item2ph[item_name] = r['ph']
                self.item2wavfn[item_name] = r['wav_fn']
                # Every item shares one speaker label when speaker ids are off.
                spk_name = r.get('spk', 'SPK1') if self.binarization_args['with_spk_id'] else 'SPK1'
                if multi_dataset:
                    spk_name = f"ds{ds_id}_{spk_name}"
                self.item2spk[item_name] = spk_name
                self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
                # The emotion label is taken from the 'others' column.
                self.item2emo[item_name] = r.get('others', '"Neutral"')
        self.item_names = sorted(self.item2txt.keys())
        if self.binarization_args['shuffle']:
            # Fixed seed keeps the split assignment reproducible across runs.
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        """Items after the first ``test_num`` entries."""
        return self.item_names[hparams['test_num']:]

    @property
    def valid_item_names(self):
        """The first ``test_num`` items."""
        return self.item_names[:hparams['test_num']]

    @property
    def test_item_names(self):
        """Same items as the validation split."""
        return self.valid_item_names

    def build_spk_map(self):
        """Return ``{speaker name: index}`` with indices in sorted-name order.

        :raises AssertionError: if there are more speakers than ``hparams['num_spk']``.
        """
        spk_names = {self.item2spk[item_name] for item_name in self.item_names}
        spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
        print("| #Spk: ", len(spk_map))
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        return spk_map

    def build_emo_map(self):
        """Return ``{emotion name: index}`` with indices in sorted-name order."""
        emo_names = {self.item2emo[item_name] for item_name in self.item_names}
        emo_map = {x: i for i, x in enumerate(sorted(emo_names))}
        print("| #Emo: ", len(emo_map))
        return emo_map

    def item_name2spk_id(self, item_name):
        """Map an item name to its integer speaker id (needs ``self.spk_map``)."""
        return self.spk_map[self.item2spk[item_name]]

    def item_name2emo_id(self, item_name):
        """Map an item name to its integer emotion id (needs ``self.emo_map``)."""
        return self.emo_map[self.item2emo[item_name]]

    def _phone_encoder(self):
        """Create or load ``phone_set.json`` and return the phone encoder.

        The set is rebuilt from the phones of all loaded items when
        ``binarization_args['reset_phone_dict']`` is set or the JSON file does
        not exist yet.
        """
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = []
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            # Close (and thereby flush) the file before build_phone_encoder is
            # handed the same directory below.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f)
            print("| Build phone set: ", ph_set)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    def _word_encoder(self):
        """Create or load ``word_set.json`` and return a word-level text encoder.

        On rebuild only the ``hparams['word_size']`` most frequent words are
        kept; the encoder maps every other word to ``<UNK>``.
        """
        fn = f"{hparams['binary_data_dir']}/word_set.json"
        if self.binarization_args['reset_word_dict']:
            word_counts = Counter()
            for word_sent in self.item2txt.values():
                word_counts.update(x for x in word_sent.split(' ') if x != '')
            total_words = sum(word_counts.values())
            most_common = word_counts.most_common(hparams['word_size'])
            num_unk_words = total_words - sum(count for _, count in most_common)
            word_set = [word for word, _ in most_common]
            with open(fn, 'w') as f:
                json.dump(word_set, f)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            with open(fn, 'r') as f:
                word_set = json.load(f)
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')

    def meta_data(self, prefix):
        """Yield ``(item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion)`` for one split.

        :param prefix: ``'valid'`` or ``'test'``; anything else selects train.
        """
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            ph = self.item2ph[item_name]
            txt = self.item2txt[item_name]
            tg_fn = self.item2tgfn.get(item_name)
            wav_fn = self.item2wavfn[item_name]
            spk_id = self.item_name2spk_id(item_name)
            emotion = self.item_name2emo_id(item_name)
            yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion

    def process(self):
        """Load metadata, write the speaker/emotion maps and binarize all splits."""
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        with open(spk_map_fn, 'w') as f:
            json.dump(self.spk_map, f)

        self.emo_map = self.build_emo_map()
        print("| emo_map: ", self.emo_map)
        emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
        with open(emo_map_fn, 'w') as f:
            json.dump(self.emo_map, f)

        self.phone_encoder = self._phone_encoder()
        self.word_encoder = None
        # Must be loaded before process_data computes the emotion embeddings.
        EmotionEncoder.load_model(hparams['emotion_encoder_path'])

        if self.binarization_args['with_word']:
            self.word_encoder = self._word_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def process_data(self, prefix):
        """Binarize one split and store its dataset, lengths and f0 statistics.

        :param prefix: split name (``'valid'``, ``'test'`` or ``'train'``), also
            used as the file-name prefix inside ``hparams['binary_data_dir']``.
        """
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        ph_lengths = []
        mel_lengths = []
        f0s = []
        total_sec = 0
        with_spk_embed = self.binarization_args['with_spk_embed']
        voice_encoder = VoiceEncoder().cuda() if with_spk_embed else None

        meta_data = list(self.meta_data(prefix))
        encoders = (self.phone_encoder, self.word_encoder)
        args = [list(m) + [encoders, self.binarization_args] for m in meta_data]
        num_workers = self.num_workers
        results = chunked_multiprocess_run(self.process_item, args, num_workers=num_workers)
        # tqdm(meta_data) is zipped in only to drive the progress bar.
        for _, item in zip(tqdm(meta_data), results):
            if item is None:  # process_item skipped this item
                continue
            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) if with_spk_embed else None
            # The emotion embedding is computed from the wav file on disk.
            processed_wav = preprocess_wav(item['wav_fn'])
            item['emo_embed'] = Embed_utterance(processed_wav)
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']
            builder.add_item(item)
            mel_lengths.append(item['len'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
            if item.get('f0') is not None:
                f0s.append(item['f0'])
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if len(ph_lengths) > 0:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        if len(f0s) > 0:
            f0s = np.concatenate(f0s, 0)
            # Zero entries are excluded from the statistics.
            f0s = f0s[f0s != 0]
            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
        """Extract all features of one item.

        :param encoder: ``(phone_encoder, word_encoder)`` tuple.
        :param binarization_args: dict of feature switches (``with_linear``,
            ``with_f0``, ``with_f0cwt``, ``with_txt``, ``with_align``,
            ``trim_eos_bos``, ``with_word``).
        :return: the feature dict, or ``None`` when feature extraction fails
            (the reason is printed).
        """
        res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
        if binarization_args['with_linear']:
            wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn)  # , return_linear=True
            res['linear'] = linear_stft
        else:
            wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
        wav = wav.astype(np.float16)
        res.update({'mel': mel, 'wav': wav,
                    'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(res)
                if binarization_args['with_f0cwt']:
                    cls.get_f0cwt(res)
            if binarization_args['with_txt']:
                ph_encoder, word_encoder = encoder
                try:
                    res['phone'] = ph_encoder.encode(ph)
                    res['ph_len'] = len(res['phone'])
                except Exception:
                    # Any encoding failure is reported and turns into a skip.
                    traceback.print_exc()
                    raise BinarizationError("Empty phoneme")
                if binarization_args['with_align']:
                    cls.get_align(tg_fn, res)
                    if binarization_args['trim_eos_bos']:
                        cls._trim_bos_eos(res, wav, hparams['hop_size'])
                    if binarization_args['with_word']:
                        cls.get_word(res, word_encoder)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        except Exception:
            traceback.print_exc()
            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res

    @staticmethod
    def _trim_bos_eos(res, wav, hop_size):
        """Cut the frames of the first and last ``dur`` entries from ``res``.

        ``res['dur'][0]`` and ``res['dur'][-1]`` are the number of leading
        (BOS) and trailing (EOS) frames to drop.  The end index is computed
        from the length rather than as ``-eos_dur``, because a trailing
        duration of 0 would otherwise make the slice ``[bos:0]`` and wipe the
        whole array.  Frame-level entries that are absent (for example ``f0``
        when pitch extraction is off) are left out instead of raising.

        :param res: feature dict; updated in place.
        :param wav: waveform matching ``res['mel']``.
        :param hop_size: samples per mel frame.
        """
        bos_dur = int(res['dur'][0])
        eos_dur = int(res['dur'][-1])

        def crop(seq, head, tail):
            # Same result as seq[head:-tail] for tail > 0, but correct for tail == 0.
            return seq[head:max(len(seq) - tail, head)]

        for key in ('mel', 'f0', 'pitch', 'mel2ph'):
            if key in res:
                res[key] = crop(res[key], bos_dur, eos_dur)
        res['wav'] = crop(wav, bos_dur * hop_size, eos_dur * hop_size)
        res['dur'] = res['dur'][1:-1]
        res['len'] = res['mel'].shape[0]

    @staticmethod
    def get_align(tg_fn, res):
        """Store ``mel2ph`` and ``dur`` from the item's TextGrid in ``res``.

        :raises BinarizationError: if the TextGrid is missing or refers to more
            phones than ``res['phone']`` contains.
        """
        ph = res['ph']
        mel = res['mel']
        phone_encoded = res['phone']
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError("Align not found")
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur

    @staticmethod
    def get_pitch(res):
        """Store ``f0`` and the coarse ``pitch`` in ``res``.

        :raises BinarizationError: if the extracted f0 sums to zero.
        """
        wav, mel = res['wav'], res['mel']
        # The module-level get_pitch helper, not this method.
        f0, pitch_coarse = get_pitch(wav, mel, hparams)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        res['f0'] = f0
        res['pitch'] = pitch_coarse

    @staticmethod
    def get_f0cwt(res):
        """Store the wavelet transform of the normalised log-f0 in ``res``.

        :raises BinarizationError: if the transform contains NaN values.
        """
        from utils.cwt import get_cont_lf0, get_lf0_cwt
        f0 = res['f0']
        uv, cont_lf0_lpf = get_cont_lf0(f0)
        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
        if np.any(np.isnan(Wavelet_lf0)):
            raise BinarizationError("NaN CWT")
        res['cwt_spec'] = Wavelet_lf0
        res['cwt_scales'] = scales
        res['f0_mean'] = logf0s_mean_org
        res['f0_std'] = logf0s_std_org

    @staticmethod
    def get_word(res, word_encoder):
        """Add word-level alignments and word tokens to ``res``.

        Phones are grouped into words using ``'|'`` as the word boundary; a
        phone that starts with a non-alphanumeric character forms a word of
        its own.  The method writes ``ph_words``, ``ph2word``, ``mel2word``,
        ``dur_word``, ``words`` and ``word_tokens``.

        :raises AssertionError: if the number of words in ``res['txt']`` does
            not match the number of phone groups.
        """
        ph_split = res['ph'].split(" ")
        # ph side mapping to word
        ph_words = []  # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
        ph2word = np.zeros([len(ph_split)], dtype=int)
        last_ph_idx_for_word = []  # [2, 11, ...]
        for i, ph in enumerate(ph_split):
            if ph == '|':
                last_ph_idx_for_word.append(i)
            elif not ph[0].isalnum():
                # A symbol closes the word before it (nothing precedes <BOS>)
                # and then forms a one-phone word itself.
                if ph not in ['<BOS>']:
                    last_ph_idx_for_word.append(i - 1)
                last_ph_idx_for_word.append(i)
        start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
        for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
            ph_words.append(ph_split[s_w:e_w + 1])
            ph2word[s_w:e_w + 1] = i
        ph2word = ph2word.tolist()
        ph_words = ["_".join(w) for w in ph_words]

        # mel side mapping to word
        mel2word = []
        dur_word = [0 for _ in range(len(ph_words))]
        for m2p in res['mel2ph']:
            # mel2ph is 1-based, hence the -1.
            word_idx = ph2word[m2p - 1]
            mel2word.append(word_idx)
            dur_word[word_idx] += 1
        ph2word = [x + 1 for x in ph2word]  # 0 is reserved for padding
        mel2word = [x + 1 for x in mel2word]  # 0 is reserved for padding
        res['ph_words'] = ph_words  # [T_word]
        res['ph2word'] = ph2word  # [T_ph]
        res['mel2word'] = mel2word  # [T_mel]
        res['dur_word'] = dur_word  # [T_word]
        words = [x for x in res['txt'].split(" ") if x != '']
        # Drop silence tokens at both ends before adding explicit BOS/EOS.
        while len(words) > 0 and is_sil_phoneme(words[0]):
            words = words[1:]
        while len(words) > 0 and is_sil_phoneme(words[-1]):
            words = words[:-1]
        words = ['<BOS>'] + words + ['<EOS>']
        word_tokens = word_encoder.encode(" ".join(words))
        res['words'] = words
        res['word_tokens'] = word_tokens
        assert len(words) == len(ph_words), [words, ph_words]

    @property
    def num_workers(self):
        """Worker count: ``$N_PROC``, else ``hparams['N_PROC']``, else the CPU count.

        ``os.cpu_count()`` may return ``None``; one worker is used then.
        """
        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count() or 1)))
|
348 |
+
|
349 |
+
|
350 |
+
if __name__ == "__main__":
    # set_hparams() runs first because EmotionBinarizer.__init__ reads the
    # module-level hparams.
    set_hparams()
    binarizer = EmotionBinarizer()
    binarizer.process()
|
data_gen/tts/base_preprocess.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import re
|
5 |
+
import traceback
|
6 |
+
from collections import Counter
|
7 |
+
from functools import partial
|
8 |
+
|
9 |
+
import librosa
|
10 |
+
from tqdm import tqdm
|
11 |
+
from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
|
12 |
+
from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
|
13 |
+
from utils.hparams import hparams
|
14 |
+
from utils.multiprocess_utils import multiprocess_run_tqdm
|
15 |
+
from utils.os_utils import link_file, move_file, remove_file
|
16 |
+
from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
|
17 |
+
|
18 |
+
|
19 |
+
class BasePreprocessor:
|
20 |
+
def __init__(self):
    """Read the pre-processing configuration and paths from ``hparams``."""
    self.raw_data_dir = hparams['raw_data_dir']
    self.processed_dir = hparams['processed_data_dir']
    # The speaker map is stored inside the processed data directory.
    self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
    self.preprocess_args = hparams['preprocess_args']
    # Resolve the configured text-processor name to its class.
    self.txt_processor = get_txt_processor_cls(self.preprocess_args['txt_processor'])
|
27 |
+
|
28 |
+
def meta_data(self):
    """Yield the raw items of the dataset; must be overridden by subclasses.

    :return: an iterable of dicts shaped like
        {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
    """
    raise NotImplementedError()
|
33 |
+
|
34 |
+
def process(self):
    """Run the whole pre-processing pipeline and write the metadata JSON.

    Steps: text/wav processing of every item, token encoding, optional
    creation of the MFA inputs and dictionary, and finally the dump of all
    items to ``{processed_dir}/{meta_csv_filename}.json``.
    """
    out_dir = self.processed_dir
    # Both wav directories are wiped and re-created for a clean run.
    tmp_wav_dir = f'{out_dir}/processed_tmp'
    remove_file(tmp_wav_dir)
    os.makedirs(tmp_wav_dir, exist_ok=True)
    final_wav_dir = f'{out_dir}/{self.wav_processed_dirname}'
    remove_file(final_wav_dir)
    os.makedirs(final_wav_dir, exist_ok=True)

    raw_items = list(tqdm(self.meta_data(), desc='Load meta data'))
    names = [entry['item_name'] for entry in raw_items]
    assert len(names) == len(set(names)), 'Key `item_name` should be Unique.'

    # First pass: text -> phones and wav processing.
    first_pass = partial(
        self.preprocess_first_pass,
        txt_processor=self.txt_processor,
        wav_processed_dir=final_wav_dir,
        wav_processed_tmp=tmp_wav_dir,
        preprocess_args=self.preprocess_args,
    )
    first_pass_args = []
    for entry in raw_items:
        first_pass_args.append({
            'item_name': entry['item_name'],
            'txt_raw': entry['txt'],
            'wav_fn': entry['wav_fn'],
            'txt_loader': entry.get('txt_loader'),
            'others': entry.get('others', None),
        })

    items = []
    phone_list = []
    word_list = []
    spk_names = set()
    first_pass_results = multiprocess_run_tqdm(first_pass, first_pass_args, desc='Preprocess')
    for entry, (item_id, processed) in zip(raw_items, first_pass_results):
        if processed is None:  # the first pass failed for this item
            continue
        # Merge the processed fields into the raw metadata dict.
        entry.update(processed)
        entry.pop('txt_loader', None)
        entry['id'] = item_id
        entry.setdefault('spk_name', '<SINGLE_SPK>')
        entry.setdefault('others', None)
        phone_list.extend(entry['ph'].split(" "))
        word_list.extend(entry['word'].split(" "))
        spk_names.add(entry['spk_name'])
        items.append(entry)

    # Second pass: add encoded tokens.
    ph_encoder = self._phone_encoder(phone_list)
    word_encoder = self._word_encoder(word_list)
    spk_map = self.build_spk_map(spk_names)
    second_pass_args = []
    for entry in items:
        second_pass_args.append({
            'ph': entry['ph'], 'word': entry['word'], 'spk_name': entry['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map,
        })
    for idx, new_fields in multiprocess_run_tqdm(
            self.preprocess_second_pass, second_pass_args, desc='Add encoded tokens'):
        items[idx].update(new_fields)

    # Optional: build the MFA inputs and pronunciation dictionary.
    if self.preprocess_args['use_mfa']:
        mfa_entries = set()
        mfa_input_dir = f'{out_dir}/mfa_inputs'
        remove_file(mfa_input_dir)
        # group MFA inputs for better parallelism
        group_size = self.preprocess_args['nsample_per_mfa_group']
        mfa_groups = [idx // group_size for idx in range(len(items))]
        if self.preprocess_args['mfa_group_shuffle']:
            random.seed(hparams['seed'])
            random.shuffle(mfa_groups)
        mfa_args = []
        for entry, group in zip(items, mfa_groups):
            mfa_args.append({
                'item': entry, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': group, 'wav_processed_tmp': tmp_wav_dir,
                'preprocess_args': self.preprocess_args,
            })
        for idx, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                self.build_mfa_inputs, mfa_args, desc='Build MFA data'):
            items[idx]['wav_align_fn'] = new_wav_align_fn
            for grouped in ph_gb_word_nosil.split(" "):
                # Dictionary line: the '_'-joined word, then its phones separated by spaces.
                mfa_entries.add(f"{grouped} {grouped.replace('_', ' ')}")
        dict_lines = [f'{line}\n' for line in sorted(mfa_entries)]
        with open(f'{out_dir}/mfa_dict.txt', 'w') as f:
            f.writelines(dict_lines)

    dumped = json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)
    # Remove the line break and indent in front of digits, '+' and ']' so that
    # number lists end up on a single line.
    dumped = re.sub(r'\n\s+([\d+\]])', r'\1', dumped)
    with open(f"{out_dir}/{self.meta_csv_filename}.json", 'w') as f:
        f.write(dumped)
    remove_file(tmp_wav_dir)
|
114 |
+
|
115 |
+
@classmethod
def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                          wav_fn, wav_processed_dir, wav_processed_tmp,
                          preprocess_args, txt_loader=None, others=None):
    """Run the text and wav preprocessing for a single item.

    The raw text is optionally passed through ``txt_loader``, converted to
    phonemes with ``cls.txt_to_ph`` and the wav is run through
    ``cls.process_wav``. The resulting wav is then moved (when it lives in
    ``wav_processed_tmp``) or linked into ``wav_processed_dir``.

    :return: a dict with the keys ``txt``, ``txt_raw``, ``ph``, ``word``,
        ``ph2word``, ``ph_gb_word``, ``wav_fn``, ``wav_align_fn`` and
        ``others``; ``None`` when processing this item raised an exception
        (the traceback is printed).
    """
    try:
        if txt_loader is not None:
            txt_raw = txt_loader(txt_raw)
        ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
        wav_fn, wav_align_fn = cls.process_wav(
            item_name, wav_fn,
            hparams['processed_data_dir'],
            wav_processed_tmp, preprocess_args)

        # wav for binarization
        ext = os.path.splitext(wav_fn)[1]
        os.makedirs(wav_processed_dir, exist_ok=True)
        new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
        # Files produced in the temporary directory are moved; untouched
        # source files are only linked.
        move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
        move_link_func(wav_fn, new_wav_fn)
        return {
            'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
            'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
            'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
            'others': others
        }
    except Exception:
        # Best-effort per item: report and skip it. ``Exception`` (instead of
        # a bare ``except``) lets KeyboardInterrupt / SystemExit propagate so
        # the run can still be aborted.
        traceback.print_exc()
        print(f"| Error is caught. item_name: {item_name}.")
        return None
|
144 |
+
|
145 |
+
@staticmethod
def txt_to_ph(txt_processor, txt_raw, preprocess_args):
    """Convert raw text into phoneme/word strings plus a phoneme-to-word map.

    ``txt_processor.process`` must return ``(txt_struct, txt)`` where every
    entry of ``txt_struct`` holds the word at index 0 and its phonemes at
    index 1.

    :return: ``(ph, txt, word, ph2word, ph_gb_word)`` - space-joined
        phonemes, the processed text, space-joined words, the 1-based word
        index of every phoneme, and the per-word phonemes joined by ``_``.
    """
    txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
    phones, words, grouped, ph2word = [], [], [], []
    # word_id=0 is reserved for padding, hence the 1-based enumeration
    for word_id, entry in enumerate(txt_struct, start=1):
        word_phones = entry[1]
        words.append(entry[0])
        grouped.append("_".join(word_phones))
        phones.extend(word_phones)
        ph2word.extend([word_id] * len(word_phones))
    return " ".join(phones), txt, " ".join(words), ph2word, " ".join(grouped)
|
154 |
+
|
155 |
+
@staticmethod
def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
    """Run the configured wav processors over one file.

    :return: ``(wav_fn, wav_fn_for_align)``. Without any active processor
        both entries are the untouched input path. Otherwise the first is
        the output of the last processor and the second is the alignment
        file reported by the last processor that returned three values
        (``None`` if none did).
    """
    processor_classes = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
    processors = [k() for k in processor_classes if k is not None]
    if not processors:
        return wav_fn, wav_fn

    sr_file = librosa.core.get_samplerate(wav_fn)
    output_fn_for_align = None
    # Processors work on a link inside the temporary directory.
    current_fn = f"{wav_processed_tmp}/{item_name}{os.path.splitext(wav_fn)[1]}"
    link_file(wav_fn, current_fn)
    for processor in processors:
        result = processor.process(
            current_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
        if len(result) == 3:
            current_fn, _sr, output_fn_for_align = result
        else:
            current_fn, _sr = result
    return current_fn, output_fn_for_align
|
174 |
+
|
175 |
+
def _phone_encoder(self, ph_set):
    """Build (or reuse) ``phone_set.json`` and return a token encoder for it.

    The phone set is rebuilt from ``ph_set`` when
    ``preprocess_args['reset_phone_dict']`` is set or the file does not exist
    yet; otherwise the stored set is loaded.

    :param ph_set: iterable of phoneme tokens collected from the data.
    :return: the encoder built by ``build_token_encoder`` from the JSON file.
    """
    ph_set_fn = f"{self.processed_dir}/phone_set.json"
    if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
        ph_set = sorted(set(ph_set))
        # Close the file before build_token_encoder re-reads it below.
        with open(ph_set_fn, 'w') as f:
            json.dump(ph_set, f, ensure_ascii=False)
        print("| Build phone set: ", ph_set)
    else:
        with open(ph_set_fn, 'r') as f:
            ph_set = json.load(f)
        print("| Load phone set: ", ph_set)
    return build_token_encoder(ph_set_fn)
|
185 |
+
|
186 |
+
def _word_encoder(self, word_set):
    """Build (or reuse) ``word_set.json`` and return a token encoder for it.

    The word set is rebuilt when ``preprocess_args['reset_word_dict']`` is
    set or, consistently with ``_phone_encoder``, when the file does not
    exist yet. On rebuild only the ``hparams['word_dict_size']`` most common
    words are kept, plus ``<BOS>`` and ``<EOS>``.

    :param word_set: iterable of word tokens (with repetitions) from the data.
    :return: the encoder built by ``build_token_encoder`` from the JSON file.
    """
    word_set_fn = f"{self.processed_dir}/word_set.json"
    if self.preprocess_args['reset_word_dict'] or not os.path.exists(word_set_fn):
        word_counts = Counter(word_set)
        total_words = sum(word_counts.values())
        most_common = word_counts.most_common(hparams['word_dict_size'])
        # Occurrences of words that fell outside the dictionary size.
        num_unk_words = total_words - sum(count for _, count in most_common)
        word_set = ['<BOS>', '<EOS>'] + [word for word, _ in most_common]
        word_set = sorted(set(word_set))
        # Close the file before build_token_encoder re-reads it below.
        with open(word_set_fn, 'w') as f:
            json.dump(word_set, f, ensure_ascii=False)
        print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
              f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
    else:
        with open(word_set_fn, 'r') as f:
            word_set = json.load(f)
        print("| Load word set. Size: ", len(word_set), word_set[:10])
    return build_token_encoder(word_set_fn)
|
202 |
+
|
203 |
+
@classmethod
def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
    """Encode one item's words and phonemes and look up its speaker id.

    :return: dict with the keys ``word_token``, ``ph_token`` and ``spk_id``.
    """
    encoded = {}
    encoded['word_token'] = word_encoder.encode(word)
    encoded['ph_token'] = ph_encoder.encode(ph)
    encoded['spk_id'] = spk_map[spk_name]
    return encoded
|
209 |
+
|
210 |
+
def build_spk_map(self, spk_names):
    """Assign an integer id to every speaker name and save the mapping.

    Ids follow the sorted order of the names. The mapping is written as JSON
    to ``self.spk_map_fn``.

    :param spk_names: iterable of speaker names.
    :return: dict mapping speaker name to id.
    :raises AssertionError: if there are more speakers than
        ``hparams['num_spk']``.
    """
    spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
    assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
    print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
    # Use a context manager so the file is flushed and closed deterministically.
    with open(self.spk_map_fn, 'w') as f:
        json.dump(spk_map, f, ensure_ascii=False)
    return spk_map
|
216 |
+
|
217 |
+
@classmethod
def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
    """Place one item's wav and ``.lab`` transcript into its MFA group folder.

    The alignment wav is moved when it lives in ``wav_processed_tmp`` and
    linked otherwise. The transcript is ``item['ph_gb_word']`` with every
    token for which ``is_sil_phoneme`` is true removed.

    :return: ``(ph_gb_word_nosil, new_wav_align_fn)``.
    """
    item_name = item['item_name']
    src_wav = item['wav_align_fn']
    ph_gb_word = item['ph_gb_word']
    wav_ext = os.path.splitext(src_wav)[1]
    group_dir = f'{mfa_input_dir}/{mfa_group}'
    os.makedirs(group_dir, exist_ok=True)
    new_wav_align_fn = f"{group_dir}/{item_name}{wav_ext}"
    if os.path.dirname(src_wav) == wav_processed_tmp:
        move_file(src_wav, new_wav_align_fn)
    else:
        link_file(src_wav, new_wav_align_fn)

    kept_words = []
    for word in ph_gb_word.split(" "):
        if is_sil_phoneme(word):
            continue
        kept_words.append("_".join([p for p in word.split("_") if not is_sil_phoneme(p)]))
    ph_gb_word_nosil = " ".join(kept_words)

    with open(f'{group_dir}/{item_name}.lab', 'w') as f_txt:
        f_txt.write(ph_gb_word_nosil)
    return ph_gb_word_nosil, new_wav_align_fn
|
233 |
+
|
234 |
+
def load_spk_map(self, base_dir):
    """Load the speaker map stored as ``spk_map.json`` inside ``base_dir``.

    :param base_dir: directory containing ``spk_map.json``.
    :return: the decoded JSON content.
    """
    spk_map_fn = f"{base_dir}/spk_map.json"
    # Context manager closes the handle that the bare open() used to leak.
    with open(spk_map_fn, 'r') as f:
        spk_map = json.load(f)
    return spk_map
|
238 |
+
|
239 |
+
def load_dict(self, base_dir):
    """Return ``(ph_encoder, word_encoder)`` built from the JSON sets in ``base_dir``."""
    phone_set_fn = f'{base_dir}/phone_set.json'
    word_set_fn = f'{base_dir}/word_set.json'
    return build_token_encoder(phone_set_fn), build_token_encoder(word_set_fn)
|
243 |
+
|
244 |
+
@property
def meta_csv_filename(self):
    """Base name, without extension, of the metadata file."""
    filename = 'metadata'
    return filename
|
247 |
+
|
248 |
+
@property
def wav_processed_dirname(self):
    """Name of the processed-wav directory (presumably relative to the processed data dir)."""
    dirname = 'wav_processed'
    return dirname
|
data_gen/tts/bin/binarize.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
4 |
+
|
5 |
+
import importlib
|
6 |
+
from utils.hparams import set_hparams, hparams
|
7 |
+
|
8 |
+
|
9 |
+
def binarize():
    """Import the binarizer class named by ``hparams['binarizer_cls']`` and run it.

    Falls back to ``data_gen.tts.base_binarizer.BaseBinarizer`` when the
    hyper-parameter is not set.
    """
    cls_path = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    # Split "pkg.module.ClassName" at the last dot.
    pkg, _, cls_name = cls_path.rpartition(".")
    module = importlib.import_module(pkg)
    binarizer_cls = getattr(module, cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()
|
16 |
+
|
17 |
+
|
18 |
+
if __name__ == '__main__':
|
19 |
+
set_hparams()
|
20 |
+
binarize()
|
data_gen/tts/bin/pre_align.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
4 |
+
|
5 |
+
import importlib
|
6 |
+
from utils.hparams import set_hparams, hparams
|
7 |
+
|
8 |
+
|
9 |
+
def pre_align():
    """Import the class named by ``hparams['pre_align_cls']`` and call its ``process()``."""
    cls_path = hparams['pre_align_cls']
    assert cls_path != ''

    # Split "pkg.module.ClassName" at the last dot.
    pkg, _, cls_name = cls_path.rpartition(".")
    module = importlib.import_module(pkg)
    getattr(module, cls_name)().process()
|
16 |
+
|
17 |
+
|
18 |
+
if __name__ == '__main__':
|
19 |
+
set_hparams()
|
20 |
+
pre_align()
|
data_gen/tts/bin/train_mfa_align.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
from utils.hparams import hparams, set_hparams
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
def train_mfa_align():
    """Run ``usr/run_mfa_train_align.sh`` for the configured corpus.

    The corpus name is the last path component of
    ``hparams['processed_data_dir']``. The job count comes from the
    ``N_PROC`` environment variable and falls back to the CPU count. Both
    are handed to the script through the ``CORPUS`` and ``NUM_JOB``
    environment variables.

    :raises subprocess.CalledProcessError: if the script exits non-zero.
    """
    # Tolerate a trailing slash, which would otherwise yield an empty name.
    corpus = hparams['processed_data_dir'].rstrip("/").split("/")[-1]
    print(f"| Run MFA for {corpus}.")
    # os.cpu_count() may return None; an empty N_PROC also falls back.
    num_job = int(os.getenv('N_PROC') or os.cpu_count() or 1)
    # Pass values through the environment instead of interpolating them into
    # a shell command line, so names with spaces or shell metacharacters are
    # handled safely.
    env = dict(os.environ, CORPUS=corpus, NUM_JOB=str(num_job))
    subprocess.check_call(['bash', 'usr/run_mfa_train_align.sh'], env=env)
|
11 |
+
|
12 |
+
|
13 |
+
if __name__ == '__main__':
|
14 |
+
set_hparams(print_hparams=False)
|
15 |
+
train_mfa_align()
|
data_gen/tts/data_gen_utils.py
ADDED
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
|
3 |
+
warnings.filterwarnings("ignore")
|
4 |
+
|
5 |
+
# import parselmouth
|
6 |
+
import os
|
7 |
+
import torch
|
8 |
+
from skimage.transform import resize
|
9 |
+
from utils.text_encoder import TokenTextEncoder
|
10 |
+
from utils.pitch_utils import f0_to_coarse
|
11 |
+
import struct
|
12 |
+
import webrtcvad
|
13 |
+
from scipy.ndimage.morphology import binary_dilation
|
14 |
+
import librosa
|
15 |
+
import numpy as np
|
16 |
+
from utils import audio
|
17 |
+
import pyloudnorm as pyln
|
18 |
+
import re
|
19 |
+
import json
|
20 |
+
from collections import OrderedDict
|
21 |
+
|
22 |
+
PUNCS = '!,.?;:'
|
23 |
+
|
24 |
+
int16_max = (2 ** 15) - 1
|
25 |
+
|
26 |
+
|
27 |
+
def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
    """
    Load an audio file and compute a voice-activity mask that limits how long
    unvoiced segments may be.

    :param path: path of the audio file, loaded with librosa
    :param sr: sampling rate to load the audio at (``None`` keeps the file's own rate)
    :param return_raw_wav: if True, return the full waveform instead of the masked one
    :param norm: if True, loudness-normalize the waveform to -20.0 before detection
    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
    :return: ``(wav, audio_mask, sr)`` where ``wav`` is the waveform (masked unless
        ``return_raw_wav``), ``audio_mask`` is a boolean mask over the loaded waveform
        and ``sr`` is the sampling rate reported by librosa
    """

    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    sampling_rate = 16000
    wav_raw, sr = librosa.core.load(path, sr=sr)

    if norm:
        meter = pyln.Meter(sr)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav_raw)
        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
        if np.abs(wav_raw).max() > 1.0:
            wav_raw = wav_raw / np.abs(wav_raw).max()

    # Keyword arguments: recent librosa releases no longer accept the sampling
    # rates positionally.
    wav = librosa.resample(wav_raw, orig_sr=sr, target_sr=sampling_rate, res_type='kaiser_best')

    vad_window_length = 30  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width = 8

    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Two bytes per 16-bit sample, hence the factor 2 on the byte offsets.
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # The np.bool alias was removed from NumPy; the builtin is equivalent.
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    # Stretch the 16 kHz mask back to the length of the loaded waveform.
    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
    if return_raw_wav:
        return wav_raw, audio_mask, sr
    return wav_raw[audio_mask], audio_mask, sr
|
91 |
+
|
92 |
+
|
93 |
+
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg'):
    """
    Compute a log-mel spectrogram (and optionally a linear one) for an utterance.

    :param wav_path: path of an audio file, or an already loaded waveform array
    :param fmin: lowest mel frequency; -1 means 0
    :param fmax: highest mel frequency; -1 means ``sample_rate / 2``
    :param eps: floor applied to the mel magnitudes before the log
    :param loud_norm: if True, loudness-normalize the waveform to -22.0
    :param return_linear: if True, also return the normalized dB linear spectrogram
    :param trim_long_sil: if True (and ``wav_path`` is a path), load through ``trim_long_silences``
    :param vocoder: only ``'pwg'`` is supported
    :return: ``(wav, mel)`` or ``(wav, mel, spc)``; ``mel`` is (n_mel_bins, T) and ``wav``
        is padded/cropped to ``T * hop_size`` samples
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments: recent librosa releases made these keyword-only.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    else:
        assert False, f'"{vocoder}" is not in ["pwg"].'

    # Pad and crop the waveform so it spans exactly one hop per mel frame.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel
    else:
        spc = audio.amp_to_db(spc)
        spc = audio.normalize(spc, {'min_level_db': min_level_db})
        return wav, mel, spc
|
148 |
+
|
149 |
+
|
150 |
+
def get_pitch(wav_data, mel, hparams):
    """
    Extract an f0 track with parselmouth and align its length to ``mel``.

    :param wav_data: [T]
    :param mel: [T, 80]
    :param hparams: must provide ``hop_size`` (128 or 256) and ``audio_sample_rate``
    :return: ``(f0, pitch_coarse)`` where ``f0`` has ``len(mel)`` entries and
        ``pitch_coarse`` is ``f0_to_coarse(f0)``
    """
    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
    f0_min = 80
    f0_max = 750

    if hparams['hop_size'] == 128:
        pad_size = 4
    elif hparams['hop_size'] == 256:
        pad_size = 2
    else:
        assert False
    import parselmouth
    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    lpad = pad_size * 2
    rpad = len(mel) - len(f0) - lpad
    # mel and f0 are extracted by 2 different libraries, so we force them to have the same length.
    # With some library versions ``rpad`` comes out negative; np.pad rejects negative widths,
    # so clamp it here and let the truncation below remove the surplus frames.
    f0 = np.pad(f0, [[lpad, max(rpad, 0)]], mode='constant')
    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    if delta_l > 0:
        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse
|
185 |
+
|
186 |
+
|
187 |
+
def remove_empty_lines(text):
    """Strip every line and drop all lines that are empty afterwards.

    :param text: non-empty list of strings.
    :return: a new list with the stripped, non-empty lines in their original order.
    :raises AssertionError: if ``text`` is empty or not a list.
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # list.remove("") only dropped the first empty line; filter out all of them.
    return [line for line in (t.strip() for t in text) if line]
|
195 |
+
|
196 |
+
|
197 |
+
class TextGrid(object):
    """Line-oriented parser for TextGrid text; only ``IntervalTier`` tiers are supported."""

    def __init__(self, text):
        """Parse ``text`` (a list of lines) into header fields and ``tier_list``."""
        self.text = remove_empty_lines(text)
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Match ``pattern`` against the current line and advance the cursor.

        Parameters
        ----------
        pattern : regex whose first group is the value to extract
        inc : number of lines to advance after a successful match

        Returns
        -------
        value : text captured by the first group

        Raises
        ------
        ValueError : if the current line does not match ``pattern``
        """
        current = self.text[self.line_count]
        matched = re.match(pattern, current)
        if matched is None:
            raise ValueError("File format error at line %d:%s" % (self.line_count, current))
        value = matched.group(1)
        self.line_count += inc
        return value

    def _get_type(self):
        # Advance by two lines: the line after the file type is skipped.
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Read ``self.size`` tiers. Only supports IntervalTier currently."""
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            intervals = []
            for _ in range(int(tier_size)):
                # The entries are evaluated in order, which keeps the line cursor in step.
                intervals.append(OrderedDict([
                    ("idx", self._extract_pattern(r"intervals \[(.*)\]", 1)),
                    ("xmin", self._extract_pattern(r"xmin = (.*)", 1)),
                    ("xmax", self._extract_pattern(r"xmax = (.*)", 1)),
                    ("text", self._extract_pattern(r"text = \"(.*)\"", 1)),
                ]))
            self.tier_list.append(OrderedDict([
                ("idx", tier_idx),
                ("class", tier_class),
                ("name", tier_name),
                ("xmin", tier_xmin),
                ("xmax", tier_xmax),
                ("size", tier_size),
                ("items", intervals),
            ]))

    def toJson(self):
        """Serialize the parsed header fields and tiers to a JSON string."""
        payload = OrderedDict([
            ("file_type", self.file_type),
            ("xmin", self.xmin),
            ("xmax", self.xmax),
            ("size", self.size),
            ("tiers", self.tier_list),
        ])
        return json.dumps(payload, ensure_ascii=False, indent=2)
|
272 |
+
|
273 |
+
|
274 |
+
def get_mel2ph(tg_fn, ph, mel, hparams):
    """Map mel frames to phoneme indices using a TextGrid alignment.

    :param tg_fn: path of the TextGrid file; its last tier is used as the alignment.
    :param ph: space-separated phoneme string of the utterance.
    :param mel: array whose first dimension is the number of frames.
    :param hparams: must provide ``audio_sample_rate`` and ``hop_size``.
    :return: ``(mel2ph, dur)`` - ``mel2ph[t]`` is the 1-based index of the phoneme
        covering frame ``t`` (0 where no phoneme covers it) and ``dur[i]`` is the
        number of frames assigned to phoneme ``i``.
    :raises AssertionError: if the TextGrid intervals and ``ph`` cannot be matched.
    """
    ph_list = ph.split(" ")
    with open(tg_fn, "r") as f:
        tg = f.readlines()
    tg = remove_empty_lines(tg)
    tg = TextGrid(tg)
    tg = json.loads(tg.toJson())
    # Start time (seconds) of every phoneme; -1 marks "not assigned yet".
    # The np.float alias was removed from NumPy; the builtin is equivalent.
    split = np.ones(len(ph_list) + 1, float) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = list(tg['tiers'][-1]['items'])
    tg_align_ = []
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
            x['text'] = ''
            # Merge consecutive silence intervals into one.
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        # Trailing silence phonemes without a matching interval start "at infinity".
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        # Trailing silence intervals without a matching phoneme are skipped.
        if x['text'] == '' and ph_idx == len(ph_list):
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            # Silence phoneme with no silence interval: it gets its start time later.
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                   or x['text'].lower() == ph.lower() \
                   or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            # A preceding unassigned silence phoneme shares this start (zero duration).
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
    # The np.int alias was removed from NumPy; the builtin is equivalent.
    mel2ph = np.zeros([mel.shape[0]], int)
    split[0] = 0
    split[-1] = 1e8
    for i in range(len(split) - 1):
        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
    # Convert start times in seconds to (rounded) frame indices.
    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
    for ph_idx in range(len(ph_list)):
        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
    mel2ph_torch = torch.from_numpy(mel2ph)
    T_t = len(ph_list)
    # Count frames per phoneme index; slot 0 collects unassigned frames and is dropped.
    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
    dur = dur[1:].numpy()
    return mel2ph, dur
|
338 |
+
|
339 |
+
|
340 |
+
def build_phone_encoder(data_dir):
    """Build a ``TokenTextEncoder`` from ``phone_set.json`` in ``data_dir``.

    Out-of-vocabulary tokens are replaced by ``','``.
    """
    phone_list_file = os.path.join(data_dir, 'phone_set.json')
    # Context manager closes the handle that the bare open() used to leak.
    with open(phone_list_file) as f:
        phone_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
|
344 |
+
|
345 |
+
def build_word_encoder(data_dir):
    """Build a ``TokenTextEncoder`` from ``word_set.json`` in ``data_dir``.

    Out-of-vocabulary tokens are replaced by ``','``.
    """
    word_list_file = os.path.join(data_dir, 'word_set.json')
    # Context manager closes the handle that the bare open() used to leak.
    with open(word_list_file) as f:
        word_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
|
349 |
+
|
350 |
+
def is_sil_phoneme(p):
    """Return True when ``p`` is a silence/non-speech token.

    A token counts as silence when it is empty or when its first character is
    not alphabetic (e.g. punctuation or ``<BOS>``).
    """
    # The empty-string check avoids an IndexError on p[0].
    return p == '' or not p[0].isalpha()
|
352 |
+
|
353 |
+
|
354 |
+
def build_token_encoder(token_list_file):
    """Build a ``TokenTextEncoder`` from a JSON file holding the token list.

    Out-of-vocabulary tokens are replaced by ``'<UNK>'``.
    """
    # Context manager closes the handle that the bare open() used to leak.
    with open(token_list_file) as f:
        token_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')
|
data_gen/tts/emotion/audio.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from scipy.ndimage.morphology import binary_dilation
|
2 |
+
from data_gen.tts.emotion.params_data import *
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Optional, Union
|
5 |
+
import numpy as np
|
6 |
+
import webrtcvad
|
7 |
+
import librosa
|
8 |
+
import struct
|
9 |
+
|
10 |
+
int16_max = (2 ** 15) - 1
|
11 |
+
|
12 |
+
|
13 |
+
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    :return: the resampled, volume-normalized and silence-trimmed waveform.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        # Keyword arguments: recent librosa releases no longer accept the
        # sampling rates positionally.
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav)

    return wav
|
41 |
+
|
42 |
+
|
43 |
+
def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.

    :param wav: the preprocessed waveform
    :return: float32 array with the frame axis first (the librosa output transposed)
    """
    # Keyword arguments: recent librosa releases made ``y`` and ``sr`` keyword-only.
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
|
56 |
+
|
57 |
+
|
58 |
+
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters imported from params_data.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Two bytes per 16-bit sample, hence the factor 2 on the byte offsets.
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # The np.bool alias was removed from NumPy; the builtin is equivalent.
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    # The mask is already boolean, so it can index the waveform directly.
    return wav[audio_mask]
|
99 |
+
|
100 |
+
|
101 |
+
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Scale a waveform so that its mean power matches ``target_dBFS``.

    :param wav: the waveform as a numpy array of floats
    :param target_dBFS: target level in dBFS
    :param increase_only: if True, never attenuate (return ``wav`` unchanged instead)
    :param decrease_only: if True, never amplify (return ``wav`` unchanged instead)
    :return: the scaled waveform, or ``wav`` itself when no scaling applies
    :raises ValueError: if both ``increase_only`` and ``decrease_only`` are set
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    power = np.mean(wav ** 2)
    # A silent (or empty) waveform has no defined level: log10(0) would give
    # an infinite gain and turn the output into NaNs, so leave it untouched.
    if not np.isfinite(power) or power <= 0:
        return wav
    dBFS_change = target_dBFS - 10 * np.log10(power)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
|
data_gen/tts/emotion/inference.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_gen.tts.emotion.params_data import *
|
2 |
+
from data_gen.tts.emotion.model import EmotionEncoder
|
3 |
+
from data_gen.tts.emotion.audio import preprocess_wav # We want to expose this function from here
|
4 |
+
from matplotlib import cm
|
5 |
+
from data_gen.tts.emotion import audio
|
6 |
+
from pathlib import Path
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
|
11 |
+
_model = None # type: EmotionEncoder
|
12 |
+
_device = None # type: torch.device
|
13 |
+
|
14 |
+
|
15 |
+
def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory and keeps it, together with its device, in the module-level
    globals. This function must be called before embed_frames_batch(), which raises if no model
    has been loaded.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
    model will be loaded and will run on this device. Outputs will however always be on the cpu.
    If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    #   was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        # An actual torch.device instance: use it as is instead of silently ignoring it
        _device = device
    _model = EmotionEncoder(_device, torch.device("cpu"))
    # map_location lets a checkpoint that was saved from a GPU be restored on a CPU-only host
    checkpoint = torch.load(weights_fpath, map_location=_device)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder trained to step %d" % (checkpoint["step"]))
|
37 |
+
|
38 |
+
|
39 |
+
def is_loaded():
    """
    Tells whether load_model() has already stored an encoder in the module-level global.

    :return: True if a model is available, False otherwise
    """
    return not (_model is None)
|
41 |
+
|
42 |
+
|
43 |
+
def embed_frames_batch(frames_batch):
    """
    Runs the loaded encoder on a batch of mel spectrograms.

    :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
    (batch_size, n_frames, n_channels)
    :return: the output of the model's inference() method, moved to the cpu and converted to a
    numpy array with one row per item of the batch
    :raises Exception: if load_model() has not been called beforehand
    """
    if _model is None:
        raise Exception("Model was not loaded. Call load_model() before inference.")

    # The input lives on the model's device; the result is always brought back to the cpu
    frames_tensor = torch.from_numpy(frames_batch).to(_device)
    encoded = _model.inference(frames_tensor)
    return encoded.detach().cpu().numpy()
|
57 |
+
|
58 |
+
|
59 |
+
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Works out how to cut an utterance waveform, and the mel spectrogram derived from it, into
    partial utterances of <partial_utterance_n_frames> frames each. Slices are produced for both
    the waveform and the spectrogram so that the i-th waveform slice lines up with the i-th
    spectrogram slice. The mel spectrogram parameters are taken from params_data.py.

    The last slices may reach beyond the end of the waveform. Padding the waveform with zeros up
    to wave_slices[-1].stop before indexing is recommended.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: the minimal fraction of the last partial utterance that must be
    covered by actual audio for that partial to be kept (as if the audio were padded). Below that
    fraction the last partial is dropped (as if the audio were trimmed). When there is only a
    single partial it is always kept, so that at least one slice is returned.
    :param overlap: the fraction by which consecutive partial utterances overlap. With 0 the
    partial utterances are entirely disjoint.
    :return: the waveform slices and the mel spectrogram slices, as two lists of slice objects
    of equal length
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    hop_samples = int((sampling_rate * mel_window_step / 1000))
    total_frames = int(np.ceil((n_samples + 1) / hop_samples))
    hop_frames = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Lay out the partials, one every <hop_frames> frames
    span = max(1, total_frames - partial_utterance_n_frames + hop_frames + 1)
    wav_slices, mel_slices = [], []
    for first_frame in range(0, span, hop_frames):
        mel_bounds = np.array([first_frame, first_frame + partial_utterance_n_frames])
        wav_bounds = mel_bounds * hop_samples
        mel_slices.append(slice(*mel_bounds))
        wav_slices.append(slice(*wav_bounds))

    # Decide whether the last partial holds enough real audio to be worth padding
    tail = wav_slices[-1]
    tail_coverage = (n_samples - tail.start) / (tail.stop - tail.start)
    if len(mel_slices) > 1 and tail_coverage < min_pad_coverage:
        mel_slices.pop()
        wav_slices.pop()

    return wav_slices, mel_slices
|
109 |
+
|
110 |
+
|
111 |
+
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, the utterance is cut into partial utterances, each one is
    embedded, and the utterance embedding is the L2-normalized mean of the partial embeddings.
    If False, the spectrogram of the whole utterance is fed to the network in one go.
    :param return_partials: if True, the partial embeddings and the waveform slices they were
    computed from are returned alongside the utterance embedding.
    :param kwargs: additional arguments forwarded to compute_partial_slices()
    :return: the utterance embedding as a numpy array. If <return_partials> is True, a tuple
    (embedding, partial embeddings, list of waveform slices) is returned instead; the last two
    items are None when <using_partials> is False.
    """
    if using_partials:
        # Find where to cut the utterance and zero-pad it so that the last partial is complete
        wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
        last_sample = wave_slices[-1].stop
        if last_sample >= len(wav):
            wav = np.pad(wav, (0, last_sample - len(wav)), "constant")

        # Embed every partial in a single batch
        mel = audio.wav_to_mel_spectrogram(wav)
        partial_embeds = embed_frames_batch(np.array([mel[s] for s in mel_slices]))

        # Average the partial embeddings and bring the result back to unit length
        mean_embed = np.mean(partial_embeds, axis=0)
        embed = mean_embed / np.linalg.norm(mean_embed, 2)
    else:
        # Feed the whole utterance as a batch of one
        mel = audio.wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(mel[None, ...])[0]
        partial_embeds = wave_slices = None

    return (embed, partial_embeds, wave_slices) if return_partials else embed
|
156 |
+
|
157 |
+
|
158 |
+
def embed_speaker(wavs, **kwargs):
    """
    Placeholder for a speaker-level embedding computed from several utterances.

    :param wavs: unused
    :param kwargs: unused
    :raises NotImplementedError: always, as this function has no implementation yet
    """
    # NotImplemented is a comparison sentinel, not an exception: calling it raised a TypeError
    raise NotImplementedError()
|
160 |
+
|
161 |
+
|
162 |
+
def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    """
    Draws an embedding vector as a 2D heatmap with a colorbar.

    :param embed: the embedding as a 1D numpy array
    :param ax: the matplotlib axes to draw on. Defaults to the current axes.
    :param title: the title given to the axes
    :param shape: the 2D shape the embedding is reshaped to before plotting. Defaults to
    (floor(sqrt(len(embed))), -1), i.e. a roughly square layout.
    :param color_range: the (min, max) values mapped to the two ends of the colormap
    """
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    # plt.get_cmap() without argument returns the default colormap; it replaces cm.get_cmap(),
    # which recent matplotlib releases no longer provide.
    mappable = ax.imshow(embed, cmap=plt.get_cmap())
    # The color limits belong to the image; Colorbar objects no longer expose set_clim(). They
    # are set before the colorbar is created so that it is drawn with the right range.
    mappable.set_clim(*color_range)
    plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(title)
|
data_gen/tts/emotion/model.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from data_gen.tts.emotion.params_model import *
|
3 |
+
from data_gen.tts.emotion.params_data import *
|
4 |
+
from torch.nn.utils import clip_grad_norm_
|
5 |
+
from scipy.optimize import brentq
|
6 |
+
from torch import nn
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
|
10 |
+
|
11 |
+
class EmotionEncoder(nn.Module):
    """
    LSTM encoder that maps batches of mel spectrograms to fixed-size vectors.

    forward() projects the final hidden state of the last LSTM layer through a linear layer and
    a ReLU and L2-normalizes the result, whereas inference() returns that hidden state directly.
    """

    def __init__(self, device, loss_device):
        """
        :param device: the torch device the LSTM, linear and ReLU layers are moved to
        :param loss_device: the torch device the similarity scaling parameters and the loss
        function are moved to
        """
        super().__init__()
        self.loss_device = loss_device

        # Network definition
        self.lstm = nn.LSTM(input_size=mel_n_channels,
                            hidden_size=model_hidden_size,
                            num_layers=model_num_layers,
                            batch_first=True).to(device)
        self.linear = nn.Linear(in_features=model_hidden_size,
                                out_features=model_embedding_size).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        """
        Scales down the gradients of the similarity parameters by a factor 0.01, then clips the
        L2 norm of the gradients of all parameters to 3.
        """
        # Gradient scale
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Gradient clipping
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes the embeddings of a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # We take only the hidden state of the last layer
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-normalize it. The ReLU can output an all-zero row, whose norm is 0: the norm is
        # clamped so that such a row stays at zero instead of turning into NaNs. Rows with a
        # non-zero norm are divided by exactly the same value as before.
        norms = torch.norm(embeds_raw, dim=1, keepdim=True).clamp_min(1e-12)
        embeds = embeds_raw / norms

        return embeds

    def inference(self, utterances, hidden_init=None):
        """
        Encodes a batch of utterance spectrograms without the projection and normalization
        applied by forward().

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the final hidden state of the last LSTM layer, as a tensor of shape
        (batch_size, hidden_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        return hidden[-1]
|
data_gen/tts/emotion/params_data.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## Mel-filterbank
mel_window_length = 25  # Milliseconds
mel_window_step = 10    # Milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# How many spectrogram frames make up a partial utterance
partials_n_frames = 160     # 160 frames x 10 ms step = 1600 ms
# How many spectrogram frames are used at inference
inference_n_frames = 80     # 80 frames x 10 ms step = 800 ms


## Voice Activity Detection
# Size of the VAD window, which sets the granularity of the VAD. Only 10, 20 or 30
# milliseconds are accepted. There should be no need to change it.
vad_window_length = 30  # Milliseconds
# How many frames are averaged together by the moving average smoothing. The larger the value,
# the larger a variation of the VAD must be for it to survive the smoothing.
vad_moving_average_width = 8
# Upper bound on the number of consecutive silent frames within a segment.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30
|
29 |
+
|
data_gen/tts/emotion/params_model.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## Model parameters
model_hidden_size = 256     # Hidden size of the LSTM
model_embedding_size = 256  # Output size of the linear projection
model_num_layers = 3        # Number of stacked LSTM layers


## Training parameters
learning_rate_init = 1e-4
speakers_per_batch = 6
utterances_per_speaker = 20
|
data_gen/tts/emotion/test_emotion.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3 -u
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the MIT license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
"""
|
8 |
+
Run inference for pre-processed data with a trained model.
|
9 |
+
"""
|
10 |
+
|
11 |
+
import logging
|
12 |
+
import math
|
13 |
+
import numpy, math, pdb, sys, random
|
14 |
+
import time, os, itertools, shutil, importlib
|
15 |
+
import argparse
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
import glob
|
19 |
+
from sklearn import metrics
|
20 |
+
import soundfile as sf
|
21 |
+
#import sentencepiece as spm
|
22 |
+
import torch
|
23 |
+
import inference as encoder
|
24 |
+
import torch.nn as nn
|
25 |
+
import torch.nn.functional as F
|
26 |
+
from pathlib import Path
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
+
logger.setLevel(logging.INFO)
|
29 |
+
from resemblyzer import VoiceEncoder, preprocess_wav
|
30 |
+
|
31 |
+
|
32 |
+
def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
    """
    Derive decision thresholds and the equal error rate from verification scores.

    The ROC curve is computed with label 1 as the positive class; both error rates are
    expressed as percentages.

    :param scores: one score per trial, higher meaning "more likely positive"
    :param labels: the ground-truth label of each trial
    :param target_fa: the false-acceptance rates (in percent) to find thresholds for
    :param target_fr: optional false-rejection rates (in percent) to find thresholds for;
        their entries come first in the returned list
    :return: a tuple (tunedThreshold, eer, fpr, fnr) where tunedThreshold is a list of
        [threshold, fpr, fnr] triplets, eer is the equal error rate in percent, and fpr / fnr
        are the full error-rate curves in percent
    """
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
    fnr = (1 - tpr) * 100
    fpr = fpr * 100

    def operating_point(target, rates):
        # Pick the ROC point whose rate lies closest to the requested one
        nearest = numpy.nanargmin(numpy.absolute((target - rates)))
        return [thresholds[nearest], fpr[nearest], fnr[nearest]]

    tunedThreshold = []
    if target_fr:
        tunedThreshold.extend(operating_point(tfr, fnr) for tfr in target_fr)
    tunedThreshold.extend(operating_point(tfa, fpr) for tfa in target_fa)

    # The EER sits where both error rates are closest to each other
    crossing = numpy.nanargmin(numpy.absolute((fnr - fpr)))
    eer = max(fpr[crossing], fnr[crossing])

    return (tunedThreshold, eer, fpr, fnr)
|
53 |
+
|
54 |
+
|
55 |
+
def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
    """
    Read an audio file and cut it into fixed-length segments.

    :param filename: path of the audio file, read with soundfile
    :param max_frames: number of frames per segment; a segment spans
        max_frames * 160 + 240 samples
    :param evalmode: if True, num_eval segments with evenly spaced start positions are taken
        (or the whole, possibly padded, audio when max_frames is 0); if False, a single segment
        with a random start position is taken
    :param num_eval: number of segments taken in evaluation mode
    :return: the stacked segments as a torch.FloatTensor with one segment per row
    """
    # Maximum audio length
    max_audio = max_frames * 160 + 240

    # Read wav file
    audio, sample_rate = sf.read(filename)
    audiosize = audio.shape[0]

    # Audio that is too short is zero-padded on both sides up to the segment length
    if audiosize <= max_audio:
        shortage = math.floor((max_audio - audiosize + 1) / 2)
        audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
        audiosize = audio.shape[0]

    if evalmode:
        startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
    else:
        startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])

    if evalmode and max_frames == 0:
        feats = [audio]
    else:
        feats = [audio[int(asf):int(asf) + max_audio] for asf in startframe]

    feat = numpy.stack(feats, axis=0)
    return torch.FloatTensor(feat)
|
83 |
+
|
84 |
+
def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
    """
    Score every trial of a verification list.

    Each line of the list holds "label file1 file2"; when the label is missing a random one is
    drawn. Every distinct file is embedded once with the module-level `voice_encoder`, then each
    trial is scored with the negated mean pairwise distance between the two L2-normalized
    embeddings. The scoring runs on the GPU.

    :param listfilename: path of the trial list
    :param print_interval: progress is written to stdout every print_interval items
    :param test_path: directory the file names of the list are relative to
    :param multi: unused
    :return: a tuple (all_scores, all_labels, all_trials) of three lists with one entry per trial
    """
    lines = []
    files = []
    feats = {}
    tstart = time.time()

    ## Read all lines
    with open(listfilename) as listfile:
        for line in listfile:
            data = line.split()

            ## Append random label if missing
            if len(data) == 2:
                data = [random.randint(0, 1)] + data

            files.append(data[1])
            files.append(data[2])
            lines.append(line)

    setfiles = sorted(set(files))

    ## Embed every distinct file once
    total_files = len(setfiles)
    for idx, rel_path in enumerate(setfiles):
        processed_wav = preprocess_wav(os.path.join(test_path, rel_path))
        embed = voice_encoder.embed_utterance(processed_wav)

        torch.cuda.empty_cache()
        ref_feat = torch.from_numpy(embed).unsqueeze(0)
        feats[rel_path] = ref_feat

        telapsed = time.time() - tstart
        if idx % print_interval == 0:
            sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"
                             % (idx, total_files, idx / telapsed, ref_feat.size()[1]))

    print('')
    all_scores = []
    all_labels = []
    all_trials = []
    tstart = time.time()

    ## Read files and compute all scores
    total_trials = len(lines)
    for idx, line in enumerate(lines):
        data = line.split()

        ## Append random label if missing
        if len(data) == 2:
            data = [random.randint(0, 1)] + data

        # normalize feats
        ref_feat = F.normalize(feats[data[1]].cuda(), p=2, dim=1)
        com_feat = F.normalize(feats[data[2]].cuda(), p=2, dim=1)

        dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1))
        dist = dist.detach().cpu().numpy()

        # A smaller distance must give a higher score
        all_scores.append(-1 * numpy.mean(dist))
        all_labels.append(int(data[0]))
        all_trials.append(data[1] + " " + data[2])

        if idx % print_interval == 0:
            telapsed = time.time() - tstart
            sys.stdout.write("\rComputing %d of %d: %.2f Hz" % (idx, total_trials, idx / telapsed))
            sys.stdout.flush()

    print('\n')

    return (all_scores, all_labels, all_trials)
|
163 |
+
|
164 |
+
|
165 |
+
|
166 |
+
if __name__ == '__main__':
    # Command line: all three options are mandatory strings
    parser = argparse.ArgumentParser("baseline")
    for flag, description in (("--data_root", ""),
                              ("--list", ""),
                              ("--model_dir", "model parameters for AudioEncoder")):
        parser.add_argument(flag, type=str, help=description, required=True)
    args = parser.parse_args()

    # Load the models one by one.
    # NOTE: the emotion encoder given by --model_dir is not loaded here; the resemblyzer
    # VoiceEncoder is evaluated instead.
    print("Preparing the encoder...")
    print("Insert the wav file name...")
    # evaluateFromList() reads this module-level name
    voice_encoder = VoiceEncoder().cuda()

    scores, labels, trials = evaluateFromList(args.list, print_interval=100,
                                              test_path=args.data_root)
    result = tuneThresholdfromScore(scores, labels, [1, 0.1])
    print('EER %2.4f' % result[1])
|
data_gen/tts/txt_processors/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from . import en
|
data_gen/tts/txt_processors/base_text_processor.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_gen.tts.data_gen_utils import is_sil_phoneme
|
2 |
+
|
3 |
+
# Maps a registration name to the text processor class registered under it
REGISTERED_TEXT_PROCESSORS = {}


def register_txt_processors(name):
    """Return a class decorator that stores the decorated class in the registry under `name`.

    The class itself is returned unchanged; a later registration under the same name replaces
    the earlier one.
    """

    def _register(processor_cls):
        REGISTERED_TEXT_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_txt_processor_cls(name):
    """Return the class registered under `name`, or None when nothing is registered."""
    return REGISTERED_TEXT_PROCESSORS.get(name)
|
15 |
+
|
16 |
+
|
17 |
+
class BaseTxtProcessor:
    """Base class of the text processors.

    A text structure (`txt_struct`) is a list of `[word, [phonemes]]` pairs.
    """

    @staticmethod
    def sp_phonemes():
        """Return the list of special phonemes, here only the boundary symbol."""
        return ['|']

    @classmethod
    def process(cls, txt, preprocess_args):
        """Turn raw text into a text structure; to be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        """Strip silence at both ends, then add the optional boundary and BOS/EOS entries.

        `preprocess_args` must hold the boolean keys 'with_phsep' and 'add_eos_bos'.
        """
        # remove sil phoneme in head and tail
        head, tail = 0, len(txt_struct)
        while head < tail and is_sil_phoneme(txt_struct[head][0]):
            head += 1
        while tail > head and is_sil_phoneme(txt_struct[tail - 1][0]):
            tail -= 1
        if head > 0 or tail < len(txt_struct):
            txt_struct = txt_struct[head:tail]

        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)
        if preprocess_args['add_eos_bos']:
            bos, eos = ["<BOS>", ["<BOS>"]], ["<EOS>", ["<EOS>"]]
            txt_struct = [bos] + txt_struct + [eos]
        return txt_struct

    @classmethod
    def add_bdr(cls, txt_struct):
        """Return a copy of `txt_struct` with a '|' entry between adjacent non-silence entries."""
        last = len(txt_struct) - 1
        with_boundaries = []
        for i, entry in enumerate(txt_struct):
            with_boundaries.append(entry)
            if i == last:
                continue
            # No boundary next to a silence
            if is_sil_phoneme(txt_struct[i][0]) or is_sil_phoneme(txt_struct[i + 1][0]):
                continue
            with_boundaries.append(['|', ['|']])
        return with_boundaries
|
data_gen/tts/txt_processors/en.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import unicodedata
|
3 |
+
|
4 |
+
from g2p_en import G2p
|
5 |
+
from g2p_en.expand import normalize_numbers
|
6 |
+
from nltk import pos_tag
|
7 |
+
from nltk.tokenize import TweetTokenizer
|
8 |
+
|
9 |
+
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
|
10 |
+
from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
|
11 |
+
|
12 |
+
class EnG2p(G2p):
    """G2p variant that tokenizes with nltk's TweetTokenizer and keeps non-alphabetic tokens."""

    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Convert `text` to a flat list of phonemes, with a " " item between words."""
        # preprocessing
        tagged = pos_tag(EnG2p.word_tokenize(text))  # tuples of (word, tag)

        # steps
        phonemes = []
        for position, (word, tag) in enumerate(tagged):
            if position:
                phonemes.append(" ")
            phonemes.extend(self._word_pron(word, tag))
        return phonemes

    def _word_pron(self, word, tag):
        """Return the pronunciation of a single tagged word."""
        if re.search("[a-z]", word) is None:
            # No lowercase letter at all: the token is passed through as is
            return [word]
        if word in self.homograph2features:  # Check homograph
            first_pron, second_pron, first_tag = self.homograph2features[word]
            return first_pron if tag.startswith(first_tag) else second_pron
        if word in self.cmu:  # lookup CMU dict
            return self.cmu[word][0]
        return self.predict(word)  # predict for oov
|
41 |
+
|
42 |
+
|
43 |
+
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """English text processor, registered under the name 'en'."""

    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize raw English text before grapheme-to-phoneme conversion.

        Numbers are expanded, accents stripped, the text lowercased, quotes and parentheses
        removed, hyphens turned into spaces and every character other than a space, a-z or one
        of PUNCS dropped. Each remaining punctuation mark ends up surrounded by single spaces.
        """
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # drop the spaces around punctuation
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        # This replacement used to be applied twice in a row; the second pass could never match
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert raw text to a text structure.

        :return: a tuple (txt_struct, txt) where txt_struct is the post-processed list of
            [word, [phonemes]] pairs and txt is the normalized text
        """
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        txt_struct = [[w, []] for w in txt.split(" ")]
        # The " " items emitted by g2p mark the word boundaries.
        # NOTE(review): this assumes g2p's tokenizer yields one token per space-separated word
        # of the normalized text - confirm.
        i_word = 0
        for p in phs:
            if p == ' ':
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
|
data_gen/tts/wav_processors/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from . import base_processor
|
2 |
+
from . import common_processors
|
data_gen/tts/wav_processors/base_processor.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Maps a registration name to the wav processor class registered under it
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    """Return a class decorator that stores the decorated class in the registry under `name`.

    The class itself is returned unchanged; a later registration under the same name replaces
    the earlier one.
    """

    def _register(processor_cls):
        REGISTERED_WAV_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_wav_processor_cls(name):
    """Return the class registered under `name`, or None when nothing is registered."""
    return REGISTERED_WAV_PROCESSORS.get(name)
|
14 |
+
|
15 |
+
|
16 |
+
class BaseWavProcessor:
    """Base class of the wav processors."""

    @property
    def name(self):
        """Short tag of the processor, used as a file name suffix; set by subclasses."""
        raise NotImplementedError

    def output_fn(self, input_fn):
        """Build the output path: the last 4 characters of `input_fn` are replaced by
        '_<name>.wav'."""
        stem = input_fn[:-4]
        return '{}_{}.wav'.format(stem, self.name)

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Process one audio file; to be provided by subclasses."""
        raise NotImplementedError
|
data_gen/tts/wav_processors/common_processors.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import librosa
|
4 |
+
import numpy as np
|
5 |
+
from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
|
6 |
+
from data_gen.tts.data_gen_utils import trim_long_silences
|
7 |
+
from utils.audio import save_wav
|
8 |
+
from utils.rnnoise import rnnoise
|
9 |
+
from utils.hparams import hparams
|
10 |
+
|
11 |
+
|
12 |
+
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Converts an audio file to wav with sox; registered as 'sox_to_wav'."""

    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return (path, sr): the input path itself when it already ends with '.wav', otherwise
        the path of the wav file written by sox.

        :raises subprocess.CalledProcessError: if sox exits with a non-zero status
        """
        if input_fn[-4:] == '.wav':
            return input_fn, sr
        output_fn = self.output_fn(input_fn)
        # An argument list (no shell) keeps file names containing quotes, '$' or other shell
        # metacharacters from breaking or altering the command.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, '-t', 'wav', output_fn])
        return output_fn, sr
|
25 |
+
|
26 |
+
|
27 |
+
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resamples an audio file to the requested rate; registered as 'sox_resample'."""

    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return (path, sr): the input path itself when the file is already at rate `sr`,
        otherwise the path of the resampled and trimmed file.

        :raises subprocess.CalledProcessError: if sox exits with a non-zero status
        """
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr == sr_file:
            return input_fn, sr

        # An argument list (no shell) keeps file names containing quotes, '$' or other shell
        # metacharacters from breaking or altering the command.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, f'-r{sr}', output_fn])
        # NOTE(review): the audio is reloaded from input_fn, not from the file sox just wrote,
        # and save_wav() then overwrites that file - the sox output looks unused. Kept as is;
        # confirm the intent.
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
|
44 |
+
|
45 |
+
|
46 |
+
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trims the audio with librosa.effects.trim; registered as 'trim_sil'."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Load `input_fn` at rate `sr`, trim it and save the result.

        :return: the tuple (output path, sr)
        """
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # Every other processor of this module returns (path, sr); this one used to return the
        # bare path, which cannot be unpacked the same way by a caller chaining processors.
        return output_fn, sr
|
58 |
+
|
59 |
+
|
60 |
+
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Removes long silences with trim_long_silences; registered as 'trim_all_sil'."""

    @property
    def name(self):
        # NOTE(review): same tag as TrimSILProcessor, so both write to the same output path
        # for a given input - confirm this is intended.
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Trim the silences of `input_fn` and save the result.

        `preprocess_args` must hold the key 'save_sil_mask'; when it is truthy the audio mask
        is saved to '<processed_dir>/sil_mask/<item_name>.npy'. The optional key
        'vad_max_silence_length' defaults to 12.

        :return: the tuple (output path, sr)
        """
        output_fn = self.output_fn(input_fn)
        max_silence = preprocess_args.get('vad_max_silence_length', 12)
        y, audio_mask, _ = trim_long_silences(input_fn, vad_max_silence_length=max_silence)
        save_wav(y, output_fn, sr)
        if preprocess_args['save_sil_mask']:
            mask_dir = f'{processed_dir}/sil_mask'
            os.makedirs(mask_dir, exist_ok=True)
            np.save(f'{mask_dir}/{item_name}.npy', audio_mask)
        return output_fn, sr
|
75 |
+
|
76 |
+
|
77 |
+
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Runs rnnoise on an audio file; registered as 'denoise'."""

    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the output of rnnoise for `input_fn` and return (output path, sr)."""
        denoised_fn = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_fn, out_sample_rate=sr)
        return denoised_fn, sr
|
egs/datasets/audio/emotion/base_text2mel.yaml
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
raw_data_dir: 'data/raw/ESD'
|
2 |
+
processed_data_dir: 'data/processed/emotion'
|
3 |
+
binary_data_dir: 'data/binary/emotion'
|
4 |
+
pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
|
5 |
+
audio_sample_rate: 16000
|
6 |
+
binarization_args:
|
7 |
+
shuffle: true
|
8 |
+
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
|
9 |
+
use_spk_id: true
|
10 |
+
test_num: 200
|
11 |
+
num_spk: 10
|
12 |
+
pitch_type: frame
|
13 |
+
min_frames: 128
|
14 |
+
num_test_samples: 30
|
15 |
+
mel_loss: "ssim:0.5|l1:0.5"
|
16 |
+
vocoder_ckpt: ''
|
17 |
+
use_emotion: true
|
egs/datasets/audio/emotion/pre_align.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from data_gen.tts.base_preprocess import BasePreprocessor
|
4 |
+
import glob
|
5 |
+
import re
|
6 |
+
|
7 |
+
class EmoPreAlign(BasePreprocessor):
    """Pre-alignment metadata reader for the emotion dataset."""

    def meta_data(self):
        """Yield one tuple (item_name, wav_fn, txt, spk, emotion) per transcript line.

        For each speaker the file '<raw_data_dir>/<spk>/<spk>.txt' is read. On every line the
        first field is the item name, the last field the emotion and everything in between the
        text; fields are separated by tabs or spaces. Blank lines are skipped.
        """
        spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
        pattern = re.compile('[\t\n ]+')
        for spk in spks:
            # The with-block closes the transcript once the speaker is done; the file handle
            # used to be left open.
            with open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r') as transcript:
                for line in transcript:
                    # Stripping first makes the parsing independent of the line ending: a last
                    # line without newline, or a '\r\n' ending, no longer shifts or pollutes
                    # the emotion field.
                    line = re.sub(pattern, ' ', line.strip())
                    if not line:
                        continue
                    fields = line.split(' ')
                    item_name = fields[0]
                    emotion = fields[-1]
                    txt = ' '.join(fields[1:-1])
                    wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
                    yield item_name, wav_fn, txt, spk, emotion


if __name__ == "__main__":
    EmoPreAlign().process()
|
egs/datasets/audio/libritts/base_text2mel.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
raw_data_dir: 'data/raw/LibriTTS'
|
2 |
+
processed_data_dir: 'data/processed/libritts'
|
3 |
+
binary_data_dir: 'data/binary/libritts'
|
4 |
+
pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
|
5 |
+
binarization_args:
|
6 |
+
shuffle: true
|
7 |
+
use_spk_id: true
|
8 |
+
test_num: 200
|
9 |
+
num_spk: 2320
|
10 |
+
pitch_type: frame
|
11 |
+
min_frames: 128
|
12 |
+
num_test_samples: 30
|
13 |
+
mel_loss: "ssim:0.5|l1:0.5"
|
14 |
+
vocoder_ckpt: ''
|
egs/datasets/audio/libritts/fs2.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/fs2.yaml
|
3 |
+
- ./base_text2mel.yaml
|
egs/datasets/audio/libritts/pre_align.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from data_gen.tts.base_preprocess import BasePreprocessor
|
4 |
+
import glob
|
5 |
+
|
6 |
+
|
7 |
+
class LibrittsPreAlign(BasePreprocessor):
    """Metadata reader for a LibriTTS-style tree under ``raw_data_dir``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, txt, spk)`` for every wav file found.

        Wav files are discovered three directory levels below
        ``raw_data_dir`` and visited in sorted order. The transcript is read
        from the sibling ``<wav stem>.normalized.txt`` file, and the speaker id
        is the part of the item name before the first underscore.
        """
        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
        for wav_fn in wav_fns:
            item_name = os.path.basename(wav_fn)[:-4]
            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
            # Yield the transcript as one string (the sibling pre-aligners
            # yield a string, not a list of lines); the `with` block closes
            # the file, so no explicit close() is needed.
            with open(txt_fn, 'r') as f:
                txt = f.read().strip()
            spk = item_name.split("_")[0]
            yield item_name, wav_fn, txt, spk
|
18 |
+
|
19 |
+
|
20 |
+
if __name__ == "__main__":
|
21 |
+
LibrittsPreAlign().process()
|
egs/datasets/audio/libritts/pwg.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: egs/egs_bases/tts/vocoder/pwg.yaml
|
2 |
+
raw_data_dir: 'data/raw/LibriTTS'
|
3 |
+
processed_data_dir: 'data/processed/libritts'
|
4 |
+
binary_data_dir: 'data/binary/libritts_wav'
|
5 |
+
generator_params:
|
6 |
+
kernel_size: 5
|
7 |
+
num_spk: 400
|
8 |
+
max_samples: 20480
|
egs/datasets/audio/lj/base_mel2wav.yaml
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
raw_data_dir: 'data/raw/LJSpeech-1.1'
|
2 |
+
processed_data_dir: 'data/processed/ljspeech'
|
3 |
+
binary_data_dir: 'data/binary/ljspeech_wav'
|
4 |
+
binarization_args:
|
5 |
+
with_spk_embed: false
|
egs/datasets/audio/lj/pre_align.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_gen.tts.base_preprocess import BasePreprocessor
|
2 |
+
|
3 |
+
|
4 |
+
class LJPreAlign(BasePreprocessor):
    """Metadata reader for ``{raw_data_dir}/metadata.csv`` (pipe-separated)."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, txt, 'SPK1')`` for every metadata row.

        Each row must split on ``|`` into exactly three fields; the first is
        the item name, the second is ignored and the third is the text. The
        wav path is ``{raw_data_dir}/wavs/{item_name}.wav`` and the speaker is
        always the constant ``'SPK1'``.
        """
        # Read everything up front inside a context manager so the file handle
        # is released before the first item is yielded.
        with open(f'{self.raw_data_dir}/metadata.csv') as f:
            rows = f.readlines()
        for row in rows:
            item_name, _, txt = row.strip().split("|")
            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
            yield item_name, wav_fn, txt, 'SPK1'
|
10 |
+
|
11 |
+
|
12 |
+
if __name__ == "__main__":
|
13 |
+
LJPreAlign().process()
|
egs/datasets/audio/lj/pwg.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/vocoder/pwg.yaml
|
3 |
+
- ./base_mel2wav.yaml
|
egs/datasets/audio/vctk/base_mel2wav.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
raw_data_dir: 'data/raw/VCTK-Corpus'
|
2 |
+
processed_data_dir: 'data/processed/vctk'
|
3 |
+
binary_data_dir: 'data/binary/vctk_wav'
|
egs/datasets/audio/vctk/fs2.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/fs2.yaml
|
3 |
+
raw_data_dir: 'data/raw/VCTK-Corpus'
|
4 |
+
processed_data_dir: 'data/processed/vctk'
|
5 |
+
binary_data_dir: 'data/binary/vctk'
|
6 |
+
pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign
|
7 |
+
use_spk_id: true
|
8 |
+
test_num: 200
|
9 |
+
num_spk: 400
|
10 |
+
binarization_args:
|
11 |
+
shuffle: true
|
12 |
+
trim_eos_bos: true
|
egs/datasets/audio/vctk/pre_align.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from data_gen.tts.base_pre_align import BasePreAlign
|
4 |
+
import glob
|
5 |
+
|
6 |
+
|
7 |
+
class VCTKPreAlign(BasePreAlign):
    """Metadata reader for the ``wav48``/``txt`` layout under ``raw_data_dir``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, (self.load_txt, txt_fn), spk)`` per utterance.

        Wav files are globbed from ``{raw_data_dir}/wav48/*/*.wav``. The
        transcript path is derived from the wav path by replacing the
        third-from-last path component with ``txt`` and the file name with
        ``{item_name}.txt``. Utterances whose transcript (or wav) does not
        exist on disk are skipped. The text itself is not read here: the
        loader ``self.load_txt`` and the transcript path are yielded as a pair.
        """
        for wav_fn in glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav'):
            item_name = os.path.basename(wav_fn)[:-4]
            # Speaker id is the part of the item name before the first underscore.
            spk = item_name.split("_")[0]

            parts = wav_fn.split("/")
            parts[-1] = f'{item_name}.txt'
            parts[-3] = 'txt'
            txt_fn = "/".join(parts)

            if not os.path.exists(txt_fn) or not os.path.exists(wav_fn):
                continue
            yield item_name, wav_fn, (self.load_txt, txt_fn), spk
|
19 |
+
|
20 |
+
|
21 |
+
if __name__ == "__main__":
|
22 |
+
VCTKPreAlign().process()
|
egs/datasets/audio/vctk/pwg.yaml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/vocoder/pwg.yaml
|
3 |
+
- ./base_mel2wav.yaml
|
4 |
+
|
5 |
+
num_spk: 400
|
6 |
+
max_samples: 20480
|
egs/egs_bases/config_base.yaml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# task
|
2 |
+
binary_data_dir: ''
|
3 |
+
work_dir: '' # experiment directory.
|
4 |
+
infer: false # inference
|
5 |
+
amp: false
|
6 |
+
seed: 1234
|
7 |
+
debug: false
|
8 |
+
save_codes: []
|
9 |
+
# - configs
|
10 |
+
# - modules
|
11 |
+
# - tasks
|
12 |
+
# - utils
|
13 |
+
# - usr
|
14 |
+
|
15 |
+
#############
|
16 |
+
# dataset
|
17 |
+
#############
|
18 |
+
ds_workers: 1
|
19 |
+
test_num: 100
|
20 |
+
endless_ds: false
|
21 |
+
sort_by_len: true
|
22 |
+
|
23 |
+
#########
|
24 |
+
# train and eval
|
25 |
+
#########
|
26 |
+
print_nan_grads: false
|
27 |
+
load_ckpt: ''
|
28 |
+
save_best: true
|
29 |
+
num_ckpt_keep: 3
|
30 |
+
clip_grad_norm: 0
|
31 |
+
accumulate_grad_batches: 1
|
32 |
+
tb_log_interval: 100
|
33 |
+
num_sanity_val_steps: 5 # steps of validation at the beginning
|
34 |
+
check_val_every_n_epoch: 10
|
35 |
+
val_check_interval: 2000
|
36 |
+
valid_monitor_key: 'val_loss'
|
37 |
+
valid_monitor_mode: 'min'
|
38 |
+
max_epochs: 1000
|
39 |
+
max_updates: 1000000
|
40 |
+
max_tokens: 31250
|
41 |
+
max_sentences: 100000
|
42 |
+
max_valid_tokens: -1
|
43 |
+
max_valid_sentences: -1
|
44 |
+
test_input_dir: ''
|
45 |
+
resume_from_checkpoint: 0
|
46 |
+
rename_tmux: true
|