Delete synthesizer
- synthesizer/LICENSE.txt +0 -24
- synthesizer/__init__.py +0 -1
- synthesizer/audio.py +0 -206
- synthesizer/hparams.py +0 -92
- synthesizer/inference.py +0 -165
- synthesizer/models/tacotron.py +0 -519
- synthesizer/preprocess.py +0 -258
- synthesizer/synthesize.py +0 -92
- synthesizer/synthesizer_dataset.py +0 -92
- synthesizer/train.py +0 -258
- synthesizer/utils/__init__.py +0 -45
- synthesizer/utils/_cmudict.py +0 -62
- synthesizer/utils/cleaners.py +0 -88
- synthesizer/utils/numbers.py +0 -69
- synthesizer/utils/plot.py +0 -82
- synthesizer/utils/symbols.py +0 -17
- synthesizer/utils/text.py +0 -75
synthesizer/LICENSE.txt
DELETED
@@ -1,24 +0,0 @@
-MIT License
-
-Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
-Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
-Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
-Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
synthesizer/__init__.py
DELETED
@@ -1 +0,0 @@
-#
synthesizer/audio.py
DELETED
@@ -1,206 +0,0 @@
-import librosa
-import librosa.filters
-import numpy as np
-from scipy import signal
-from scipy.io import wavfile
-import soundfile as sf
-
-
-def load_wav(path, sr):
-    return librosa.core.load(path, sr=sr)[0]
-
-def save_wav(wav, path, sr):
-    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-    #proposed by @dsmiller
-    wavfile.write(path, sr, wav.astype(np.int16))
-
-def save_wavenet_wav(wav, path, sr):
-    sf.write(path, wav.astype(np.float32), sr)
-
-def preemphasis(wav, k, preemphasize=True):
-    if preemphasize:
-        return signal.lfilter([1, -k], [1], wav)
-    return wav
-
-def inv_preemphasis(wav, k, inv_preemphasize=True):
-    if inv_preemphasize:
-        return signal.lfilter([1], [1, -k], wav)
-    return wav
-
-#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
-def start_and_end_indices(quantized, silence_threshold=2):
-    for start in range(quantized.size):
-        if abs(quantized[start] - 127) > silence_threshold:
-            break
-    for end in range(quantized.size - 1, 1, -1):
-        if abs(quantized[end] - 127) > silence_threshold:
-            break
-
-    assert abs(quantized[start] - 127) > silence_threshold
-    assert abs(quantized[end] - 127) > silence_threshold
-
-    return start, end
-
-def get_hop_size(hparams):
-    hop_size = hparams.hop_size
-    if hop_size is None:
-        assert hparams.frame_shift_ms is not None
-        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-    return hop_size
-
-def linearspectrogram(wav, hparams):
-    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
-    S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
-
-def melspectrogram(wav, hparams):
-    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
-    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
-
-def inv_linear_spectrogram(linear_spectrogram, hparams):
-    """Converts linear spectrogram to waveform using librosa"""
-    if hparams.signal_normalization:
-        D = _denormalize(linear_spectrogram, hparams)
-    else:
-        D = linear_spectrogram
-
-    S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
-        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
-
-def inv_mel_spectrogram(mel_spectrogram, hparams):
-    """Converts mel spectrogram to waveform using librosa"""
-    if hparams.signal_normalization:
-        D = _denormalize(mel_spectrogram, hparams)
-    else:
-        D = mel_spectrogram
-
-    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
-        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
-
-def _lws_processor(hparams):
-    import lws
-    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
-
-def _griffin_lim(S, hparams):
-    """librosa implementation of Griffin-Lim
-    Based on https://github.com/librosa/librosa/issues/434
-    """
-    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
-    y = _istft(S_complex * angles, hparams)
-    for i in range(hparams.griffin_lim_iters):
-        angles = np.exp(1j * np.angle(_stft(y, hparams)))
-        y = _istft(S_complex * angles, hparams)
-    return y
-
-def _stft(y, hparams):
-    if hparams.use_lws:
-        return _lws_processor(hparams).stft(y).T
-    else:
-        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
-
-def _istft(y, hparams):
-    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
-
-##########################################################
-#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
-def num_frames(length, fsize, fshift):
-    """Compute number of time frames of spectrogram
-    """
-    pad = (fsize - fshift)
-    if length % fshift == 0:
-        M = (length + pad * 2 - fsize) // fshift + 1
-    else:
-        M = (length + pad * 2 - fsize) // fshift + 2
-    return M
-
-
-def pad_lr(x, fsize, fshift):
-    """Compute left and right padding
-    """
-    M = num_frames(len(x), fsize, fshift)
-    pad = (fsize - fshift)
-    T = len(x) + 2 * pad
-    r = (M - 1) * fshift + fsize - T
-    return pad, pad + r
-##########################################################
-#Librosa correct padding
-def librosa_pad_lr(x, fsize, fshift):
-    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
-
-# Conversions
-_mel_basis = None
-_inv_mel_basis = None
-
-def _linear_to_mel(spectogram, hparams):
-    global _mel_basis
-    if _mel_basis is None:
-        _mel_basis = _build_mel_basis(hparams)
-    return np.dot(_mel_basis, spectogram)
-
-def _mel_to_linear(mel_spectrogram, hparams):
-    global _inv_mel_basis
-    if _inv_mel_basis is None:
-        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
-    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
-
-def _build_mel_basis(hparams):
-    assert hparams.fmax <= hparams.sample_rate // 2
-    return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
-                               fmin=hparams.fmin, fmax=hparams.fmax)
-
-def _amp_to_db(x, hparams):
-    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
-    return 20 * np.log10(np.maximum(min_level, x))
-
-def _db_to_amp(x):
-    return np.power(10.0, (x) * 0.05)
-
-def _normalize(S, hparams):
-    if hparams.allow_clipping_in_normalization:
-        if hparams.symmetric_mels:
-            return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-                           -hparams.max_abs_value, hparams.max_abs_value)
-        else:
-            return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
-
-    assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
-    if hparams.symmetric_mels:
-        return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
-    else:
-        return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
-
-def _denormalize(D, hparams):
-    if hparams.allow_clipping_in_normalization:
-        if hparams.symmetric_mels:
-            return (((np.clip(D, -hparams.max_abs_value,
-                              hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
-                    + hparams.min_level_db)
-        else:
-            return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
-
-    if hparams.symmetric_mels:
-        return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
-    else:
-        return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
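For reference, a minimal round trip through the deleted audio helpers (illustrative only, not part of this commit; the file paths are placeholders and hparams is the object defined in synthesizer/hparams.py below):

from synthesizer import audio
from synthesizer.hparams import hparams

# Load a 16 kHz waveform, compute the normalized mel spectrogram, then invert it with Griffin-Lim.
wav = audio.load_wav("sample.wav", sr=hparams.sample_rate)        # placeholder path
mel = audio.melspectrogram(wav, hparams)                          # shape (num_mels, frames), values clipped to [-4, 4]
rec = audio.inv_mel_spectrogram(mel, hparams)                     # Griffin-Lim reconstruction of the waveform
audio.save_wav(rec, "reconstructed.wav", sr=hparams.sample_rate)  # placeholder path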
synthesizer/hparams.py
DELETED
@@ -1,92 +0,0 @@
-import ast
-import pprint
-
-class HParams(object):
-    def __init__(self, **kwargs): self.__dict__.update(kwargs)
-    def __setitem__(self, key, value): setattr(self, key, value)
-    def __getitem__(self, key): return getattr(self, key)
-    def __repr__(self): return pprint.pformat(self.__dict__)
-
-    def parse(self, string):
-        # Overrides hparams from a comma-separated string of name=value pairs
-        if len(string) > 0:
-            overrides = [s.split("=") for s in string.split(",")]
-            keys, values = zip(*overrides)
-            keys = list(map(str.strip, keys))
-            values = list(map(str.strip, values))
-            for k in keys:
-                self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
-        return self
-
-hparams = HParams(
-    ### Signal Processing (used in both synthesizer and vocoder)
-    sample_rate = 16000,
-    n_fft = 800,
-    num_mels = 80,
-    hop_size = 200,        # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
-    win_size = 800,        # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
-    fmin = 55,
-    min_level_db = -100,
-    ref_level_db = 20,
-    max_abs_value = 4.,    # Gradient explodes if too big, premature convergence if too small.
-    preemphasis = 0.97,    # Filter coefficient to use if preemphasize is True
-    preemphasize = True,
-
-    ### Tacotron Text-to-Speech (TTS)
-    tts_embed_dims = 512,  # Embedding dimension for the graphemes/phoneme inputs
-    tts_encoder_dims = 256,
-    tts_decoder_dims = 128,
-    tts_postnet_dims = 512,
-    tts_encoder_K = 5,
-    tts_lstm_dims = 1024,
-    tts_postnet_K = 5,
-    tts_num_highways = 4,
-    tts_dropout = 0.5,
-    tts_cleaner_names = ["english_cleaners"],
-    tts_stop_threshold = -3.4,  # Value below which audio generation ends.
-                                # For example, for a range of [-4, 4], this
-                                # will terminate the sequence at the first
-                                # frame that has all values < -3.4
-
-    ### Tacotron Training
-    tts_schedule = [(2, 1e-3, 20_000, 12),    # Progressive training schedule
-                    (2, 5e-4, 40_000, 12),    # (r, lr, step, batch_size)
-                    (2, 2e-4, 80_000, 12),    #
-                    (2, 1e-4, 160_000, 12),   # r = reduction factor (# of mel frames
-                    (2, 3e-5, 320_000, 12),   #     synthesized for each decoder iteration)
-                    (2, 1e-5, 640_000, 12)],  # lr = learning rate
-
-    tts_clip_grad_norm = 1.0,   # clips the gradient norm to prevent explosion - set to None if not needed
-    tts_eval_interval = 500,    # Number of steps between model evaluation (sample generation)
-                                # Set to -1 to generate after completing epoch, or 0 to disable
-
-    tts_eval_num_samples = 1,   # Makes this number of samples
-
-    ### Data Preprocessing
-    max_mel_frames = 900,
-    rescale = True,
-    rescaling_max = 0.9,
-    synthesis_batch_size = 16,  # For vocoder preprocessing and inference.
-
-    ### Mel Visualization and Griffin-Lim
-    signal_normalization = True,
-    power = 1.5,
-    griffin_lim_iters = 60,
-
-    ### Audio processing options
-    fmax = 7600,                             # Should not exceed (sample_rate // 2)
-    allow_clipping_in_normalization = True,  # Used when signal_normalization = True
-    clip_mels_length = True,                 # If true, discards samples exceeding max_mel_frames
-    use_lws = False,                         # "Fast spectrogram phase recovery using local weighted sums"
-    symmetric_mels = True,                   # Sets mel range to [-max_abs_value, max_abs_value] if True,
-                                             #               and [0, max_abs_value] if False
-    trim_silence = True,                     # Use with sample_rate of 16000 for best results
-
-    ### SV2TTS
-    speaker_embedding_size = 256,            # Dimension for the speaker embedding
-    silence_min_duration_split = 0.4,        # Duration in seconds of a silence for an utterance to be split
-    utterance_min_duration = 1.6,            # Duration in seconds below which utterances are discarded
-    )
-
-def hparams_debug_string():
-    return str(hparams)
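For reference, a short sketch of how HParams.parse was meant to override values from a comma-separated string (illustrative only, not part of this commit):

from synthesizer.hparams import hparams

# parse() splits "name=value" pairs on commas and applies ast.literal_eval to each value.
hparams.parse("sample_rate=22050, n_fft=1024, use_lws=True")
print(hparams.sample_rate)   # 22050
print(hparams["use_lws"])    # True; __getitem__ forwards to getattr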
synthesizer/inference.py
DELETED
@@ -1,165 +0,0 @@
-import torch
-from synthesizer import audio
-from synthesizer.hparams import hparams
-from synthesizer.models.tacotron import Tacotron
-from synthesizer.utils.symbols import symbols
-from synthesizer.utils.text import text_to_sequence
-from vocoder.display import simple_table
-from pathlib import Path
-from typing import Union, List
-import numpy as np
-import librosa
-
-
-class Synthesizer:
-    sample_rate = hparams.sample_rate
-    hparams = hparams
-
-    def __init__(self, model_fpath: Path, verbose=True):
-        """
-        The model isn't instantiated and loaded in memory until needed or until load() is called.
-
-        :param model_fpath: path to the trained model file
-        :param verbose: if False, prints less information when using the model
-        """
-        self.model_fpath = model_fpath
-        self.verbose = verbose
-
-        # Check for GPU
-        if torch.cuda.is_available():
-            self.device = torch.device("cuda")
-        else:
-            self.device = torch.device("cpu")
-        if self.verbose:
-            print("Synthesizer using device:", self.device)
-
-        # Tacotron model will be instantiated later on first use.
-        self._model = None
-
-    def is_loaded(self):
-        """
-        Whether the model is loaded in memory.
-        """
-        return self._model is not None
-
-    def load(self):
-        """
-        Instantiates and loads the model given the weights file that was passed in the constructor.
-        """
-        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
-                               num_chars=len(symbols),
-                               encoder_dims=hparams.tts_encoder_dims,
-                               decoder_dims=hparams.tts_decoder_dims,
-                               n_mels=hparams.num_mels,
-                               fft_bins=hparams.num_mels,
-                               postnet_dims=hparams.tts_postnet_dims,
-                               encoder_K=hparams.tts_encoder_K,
-                               lstm_dims=hparams.tts_lstm_dims,
-                               postnet_K=hparams.tts_postnet_K,
-                               num_highways=hparams.tts_num_highways,
-                               dropout=hparams.tts_dropout,
-                               stop_threshold=hparams.tts_stop_threshold,
-                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)
-
-        self._model.load(self.model_fpath)
-        self._model.eval()
-
-        if self.verbose:
-            print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))
-
-    def synthesize_spectrograms(self, texts: List[str],
-                                embeddings: Union[np.ndarray, List[np.ndarray]],
-                                return_alignments=False):
-        """
-        Synthesizes mel spectrograms from texts and speaker embeddings.
-
-        :param texts: a list of N text prompts to be synthesized
-        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
-        :param return_alignments: if True, a matrix representing the alignments between the
-        characters
-        and each decoder output step will be returned for each spectrogram
-        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
-        sequence length of spectrogram i, and possibly the alignments.
-        """
-        # Load the model on the first request.
-        if not self.is_loaded():
-            self.load()
-
-        # Preprocess text inputs
-        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
-        if not isinstance(embeddings, list):
-            embeddings = [embeddings]
-
-        # Batch inputs
-        batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
-                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
-        batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
-                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
-
-        specs = []
-        for i, batch in enumerate(batched_inputs, 1):
-            if self.verbose:
-                print(f"\n| Generating {i}/{len(batched_inputs)}")
-
-            # Pad texts so they are all the same length
-            text_lens = [len(text) for text in batch]
-            max_text_len = max(text_lens)
-            chars = [pad1d(text, max_text_len) for text in batch]
-            chars = np.stack(chars)
-
-            # Stack speaker embeddings into 2D array for batch processing
-            speaker_embeds = np.stack(batched_embeds[i-1])
-
-            # Convert to tensor
-            chars = torch.tensor(chars).long().to(self.device)
-            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
-
-            # Inference
-            _, mels, alignments = self._model.generate(chars, speaker_embeddings)
-            mels = mels.detach().cpu().numpy()
-            for m in mels:
-                # Trim silence from end of each spectrogram
-                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
-                    m = m[:, :-1]
-                specs.append(m)
-
-        if self.verbose:
-            print("\n\nDone.\n")
-        return (specs, alignments) if return_alignments else specs
-
-    @staticmethod
-    def load_preprocess_wav(fpath):
-        """
-        Loads and preprocesses an audio file under the same conditions the audio files were used to
-        train the synthesizer.
-        """
-        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
-        if hparams.rescale:
-            wav = wav / np.abs(wav).max() * hparams.rescaling_max
-        return wav
-
-    @staticmethod
-    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
-        """
-        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
-        were fed to the synthesizer when training.
-        """
-        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
-            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
-        else:
-            wav = fpath_or_wav
-
-        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-        return mel_spectrogram
-
-    @staticmethod
-    def griffin_lim(mel):
-        """
-        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
-        with the same parameters present in hparams.py.
-        """
-        return audio.inv_mel_spectrogram(mel, hparams)
-
-
-def pad1d(x, max_len, pad_value=0):
-    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
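For reference, the typical call pattern for the deleted Synthesizer class (illustrative only, not part of this commit; the checkpoint path is a placeholder and the zero vector merely stands in for a real 256-dimensional speaker embedding):

import numpy as np
from pathlib import Path
from synthesizer.inference import Synthesizer

synthesizer = Synthesizer(Path("synthesizer.pt"))   # placeholder checkpoint path; the model loads lazily
embed = np.zeros(256, dtype=np.float32)             # stand-in for a real speaker embedding (speaker_embedding_size)
specs = synthesizer.synthesize_spectrograms(["Hello world."], [embed])
wav = Synthesizer.griffin_lim(specs[0])             # invert the (80, frames) mel back to a waveform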
synthesizer/models/tacotron.py
DELETED
@@ -1,519 +0,0 @@
-import os
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pathlib import Path
-from typing import Union
-
-
-class HighwayNetwork(nn.Module):
-    def __init__(self, size):
-        super().__init__()
-        self.W1 = nn.Linear(size, size)
-        self.W2 = nn.Linear(size, size)
-        self.W1.bias.data.fill_(0.)
-
-    def forward(self, x):
-        x1 = self.W1(x)
-        x2 = self.W2(x)
-        g = torch.sigmoid(x2)
-        y = g * F.relu(x1) + (1. - g) * x
-        return y
-
-
-class Encoder(nn.Module):
-    def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
-        super().__init__()
-        prenet_dims = (encoder_dims, encoder_dims)
-        cbhg_channels = encoder_dims
-        self.embedding = nn.Embedding(num_chars, embed_dims)
-        self.pre_net = PreNet(embed_dims, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
-                              dropout=dropout)
-        self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
-                         proj_channels=[cbhg_channels, cbhg_channels],
-                         num_highways=num_highways)
-
-    def forward(self, x, speaker_embedding=None):
-        x = self.embedding(x)
-        x = self.pre_net(x)
-        x.transpose_(1, 2)
-        x = self.cbhg(x)
-        if speaker_embedding is not None:
-            x = self.add_speaker_embedding(x, speaker_embedding)
-        return x
-
-    def add_speaker_embedding(self, x, speaker_embedding):
-        # SV2TTS
-        # The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
-        # When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
-        # (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
-        # This concats the speaker embedding for each char in the encoder output
-
-        # Save the dimensions as human-readable names
-        batch_size = x.size()[0]
-        num_chars = x.size()[1]
-
-        if speaker_embedding.dim() == 1:
-            idx = 0
-        else:
-            idx = 1
-
-        # Start by making a copy of each speaker embedding to match the input text length
-        # The output of this has size (batch_size, num_chars * tts_embed_dims)
-        speaker_embedding_size = speaker_embedding.size()[idx]
-        e = speaker_embedding.repeat_interleave(num_chars, dim=idx)
-
-        # Reshape it and transpose
-        e = e.reshape(batch_size, speaker_embedding_size, num_chars)
-        e = e.transpose(1, 2)
-
-        # Concatenate the tiled speaker embedding with the encoder output
-        x = torch.cat((x, e), 2)
-        return x
-
-
-class BatchNormConv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel, relu=True):
-        super().__init__()
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
-        self.bnorm = nn.BatchNorm1d(out_channels)
-        self.relu = relu
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = F.relu(x) if self.relu is True else x
-        return self.bnorm(x)
-
-
-class CBHG(nn.Module):
-    def __init__(self, K, in_channels, channels, proj_channels, num_highways):
-        super().__init__()
-
-        # List of all rnns to call `flatten_parameters()` on
-        self._to_flatten = []
-
-        self.bank_kernels = [i for i in range(1, K + 1)]
-        self.conv1d_bank = nn.ModuleList()
-        for k in self.bank_kernels:
-            conv = BatchNormConv(in_channels, channels, k)
-            self.conv1d_bank.append(conv)
-
-        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
-
-        self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
-        self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)
-
-        # Fix the highway input if necessary
-        if proj_channels[-1] != channels:
-            self.highway_mismatch = True
-            self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
-        else:
-            self.highway_mismatch = False
-
-        self.highways = nn.ModuleList()
-        for i in range(num_highways):
-            hn = HighwayNetwork(channels)
-            self.highways.append(hn)
-
-        self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
-        self._to_flatten.append(self.rnn)
-
-        # Avoid fragmentation of RNN parameters and associated warning
-        self._flatten_parameters()
-
-    def forward(self, x):
-        # Although we `_flatten_parameters()` on init, when using DataParallel
-        # the model gets replicated, making it no longer guaranteed that the
-        # weights are contiguous in GPU memory. Hence, we must call it again
-        self._flatten_parameters()
-
-        # Save these for later
-        residual = x
-        seq_len = x.size(-1)
-        conv_bank = []
-
-        # Convolution Bank
-        for conv in self.conv1d_bank:
-            c = conv(x) # Convolution
-            conv_bank.append(c[:, :, :seq_len])
-
-        # Stack along the channel axis
-        conv_bank = torch.cat(conv_bank, dim=1)
-
-        # dump the last padding to fit residual
-        x = self.maxpool(conv_bank)[:, :, :seq_len]
-
-        # Conv1d projections
-        x = self.conv_project1(x)
-        x = self.conv_project2(x)
-
-        # Residual Connect
-        x = x + residual
-
-        # Through the highways
-        x = x.transpose(1, 2)
-        if self.highway_mismatch is True:
-            x = self.pre_highway(x)
-        for h in self.highways: x = h(x)
-
-        # And then the RNN
-        x, _ = self.rnn(x)
-        return x
-
-    def _flatten_parameters(self):
-        """Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
-        to improve efficiency and avoid PyTorch yelling at us."""
-        [m.flatten_parameters() for m in self._to_flatten]
-
-class PreNet(nn.Module):
-    def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
-        super().__init__()
-        self.fc1 = nn.Linear(in_dims, fc1_dims)
-        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
-        self.p = dropout
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = F.relu(x)
-        x = F.dropout(x, self.p, training=True)
-        x = self.fc2(x)
-        x = F.relu(x)
-        x = F.dropout(x, self.p, training=True)
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, attn_dims):
-        super().__init__()
-        self.W = nn.Linear(attn_dims, attn_dims, bias=False)
-        self.v = nn.Linear(attn_dims, 1, bias=False)
-
-    def forward(self, encoder_seq_proj, query, t):
-
-        # print(encoder_seq_proj.shape)
-        # Transform the query vector
-        query_proj = self.W(query).unsqueeze(1)
-
-        # Compute the scores
-        u = self.v(torch.tanh(encoder_seq_proj + query_proj))
-        scores = F.softmax(u, dim=1)
-
-        return scores.transpose(1, 2)
-
-
-class LSA(nn.Module):
-    def __init__(self, attn_dim, kernel_size=31, filters=32):
-        super().__init__()
-        self.conv = nn.Conv1d(1, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=True)
-        self.L = nn.Linear(filters, attn_dim, bias=False)
-        self.W = nn.Linear(attn_dim, attn_dim, bias=True) # Include the attention bias in this term
-        self.v = nn.Linear(attn_dim, 1, bias=False)
-        self.cumulative = None
-        self.attention = None
-
-    def init_attention(self, encoder_seq_proj):
-        device = next(self.parameters()).device  # use same device as parameters
-        b, t, c = encoder_seq_proj.size()
-        self.cumulative = torch.zeros(b, t, device=device)
-        self.attention = torch.zeros(b, t, device=device)
-
-    def forward(self, encoder_seq_proj, query, t, chars):
-
-        if t == 0: self.init_attention(encoder_seq_proj)
-
-        processed_query = self.W(query).unsqueeze(1)
-
-        location = self.cumulative.unsqueeze(1)
-        processed_loc = self.L(self.conv(location).transpose(1, 2))
-
-        u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
-        u = u.squeeze(-1)
-
-        # Mask zero padding chars
-        u = u * (chars != 0).float()
-
-        # Smooth Attention
-        # scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
-        scores = F.softmax(u, dim=1)
-        self.attention = scores
-        self.cumulative = self.cumulative + self.attention
-
-        return scores.unsqueeze(-1).transpose(1, 2)
-
-
-class Decoder(nn.Module):
-    # Class variable because its value doesn't change between classes
-    # yet ought to be scoped by class because its a property of a Decoder
-    max_r = 20
-    def __init__(self, n_mels, encoder_dims, decoder_dims, lstm_dims,
-                 dropout, speaker_embedding_size):
-        super().__init__()
-        self.register_buffer("r", torch.tensor(1, dtype=torch.int))
-        self.n_mels = n_mels
-        prenet_dims = (decoder_dims * 2, decoder_dims * 2)
-        self.prenet = PreNet(n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
-                             dropout=dropout)
-        self.attn_net = LSA(decoder_dims)
-        self.attn_rnn = nn.GRUCell(encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims)
-        self.rnn_input = nn.Linear(encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims)
-        self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
-        self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
-        self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
-        self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)
-
-    def zoneout(self, prev, current, p=0.1):
-        device = next(self.parameters()).device  # Use same device as parameters
-        mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
-        return prev * mask + current * (1 - mask)
-
-    def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
-                hidden_states, cell_states, context_vec, t, chars):
-
-        # Need this for reshaping mels
-        batch_size = encoder_seq.size(0)
-
-        # Unpack the hidden and cell states
-        attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
-        rnn1_cell, rnn2_cell = cell_states
-
-        # PreNet for the Attention RNN
-        prenet_out = self.prenet(prenet_in)
-
-        # Compute the Attention RNN hidden state
-        attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
-        attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
-
-        # Compute the attention scores
-        scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)
-
-        # Dot product to create the context vector
-        context_vec = scores @ encoder_seq
-        context_vec = context_vec.squeeze(1)
-
-        # Concat Attention RNN output w. Context Vector & project
-        x = torch.cat([context_vec, attn_hidden], dim=1)
-        x = self.rnn_input(x)
-
-        # Compute first Residual RNN
-        rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
-        if self.training:
-            rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
-        else:
-            rnn1_hidden = rnn1_hidden_next
-        x = x + rnn1_hidden
-
-        # Compute second Residual RNN
-        rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
-        if self.training:
-            rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
-        else:
-            rnn2_hidden = rnn2_hidden_next
-        x = x + rnn2_hidden
-
-        # Project Mels
-        mels = self.mel_proj(x)
-        mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # Stop token prediction
-        s = torch.cat((x, context_vec), dim=1)
-        s = self.stop_proj(s)
-        stop_tokens = torch.sigmoid(s)
-
-        return mels, scores, hidden_states, cell_states, context_vec, stop_tokens
-
-
-class Tacotron(nn.Module):
-    def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
-                 fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K, num_highways,
-                 dropout, stop_threshold, speaker_embedding_size):
-        super().__init__()
-        self.n_mels = n_mels
-        self.lstm_dims = lstm_dims
-        self.encoder_dims = encoder_dims
-        self.decoder_dims = decoder_dims
-        self.speaker_embedding_size = speaker_embedding_size
-        self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
-                               encoder_K, num_highways, dropout)
-        self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
-        self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
-                               dropout, speaker_embedding_size)
-        self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
-                            [postnet_dims, fft_bins], num_highways)
-        self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)
-
-        self.init_model()
-        self.num_params()
-
-        self.register_buffer("step", torch.zeros(1, dtype=torch.long))
-        self.register_buffer("stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32))
-
-    @property
-    def r(self):
-        return self.decoder.r.item()
-
-    @r.setter
-    def r(self, value):
-        self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
-
-    def forward(self, x, m, speaker_embedding):
-        device = next(self.parameters()).device  # use same device as parameters
-
-        self.step += 1
-        batch_size, _, steps = m.size()
-
-        # Initialise all hidden states and pack into tuple
-        attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
-        rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-
-        # Initialise all lstm cell states and pack into tuple
-        rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # <GO> Frame for start of decoder loop
-        go_frame = torch.zeros(batch_size, self.n_mels, device=device)
-
-        # Need an initial context vector
-        context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
-
-        # SV2TTS: Run the encoder with the speaker embedding
-        # The projection avoids unnecessary matmuls in the decoder loop
-        encoder_seq = self.encoder(x, speaker_embedding)
-        encoder_seq_proj = self.encoder_proj(encoder_seq)
-
-        # Need a couple of lists for outputs
-        mel_outputs, attn_scores, stop_outputs = [], [], []
-
-        # Run the decoder loop
-        for t in range(0, steps, self.r):
-            prenet_in = m[:, :, t - 1] if t > 0 else go_frame
-            mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
-                self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
-                             hidden_states, cell_states, context_vec, t, x)
-            mel_outputs.append(mel_frames)
-            attn_scores.append(scores)
-            stop_outputs.extend([stop_tokens] * self.r)
-
-        # Concat the mel outputs into sequence
-        mel_outputs = torch.cat(mel_outputs, dim=2)
-
-        # Post-Process for Linear Spectrograms
-        postnet_out = self.postnet(mel_outputs)
-        linear = self.post_proj(postnet_out)
-        linear = linear.transpose(1, 2)
-
-        # For easy visualisation
-        attn_scores = torch.cat(attn_scores, 1)
-        # attn_scores = attn_scores.cpu().data.numpy()
-        stop_outputs = torch.cat(stop_outputs, 1)
-
-        return mel_outputs, linear, attn_scores, stop_outputs
-
-    def generate(self, x, speaker_embedding=None, steps=2000):
-        self.eval()
-        device = next(self.parameters()).device  # use same device as parameters
-
-        batch_size, _ = x.size()
-
-        # Need to initialise all hidden states and pack into tuple for tidyness
-        attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
-        rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
-        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
-
-        # Need to initialise all lstm cell states and pack into tuple for tidyness
-        rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
-        cell_states = (rnn1_cell, rnn2_cell)
-
-        # Need a <GO> Frame for start of decoder loop
-        go_frame = torch.zeros(batch_size, self.n_mels, device=device)
-
-        # Need an initial context vector
-        context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
-
-        # SV2TTS: Run the encoder with the speaker embedding
-        # The projection avoids unnecessary matmuls in the decoder loop
-        encoder_seq = self.encoder(x, speaker_embedding)
-        encoder_seq_proj = self.encoder_proj(encoder_seq)
-
-        # Need a couple of lists for outputs
-        mel_outputs, attn_scores, stop_outputs = [], [], []
-
-        # Run the decoder loop
-        for t in range(0, steps, self.r):
-            prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
-            mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
-                self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
-                             hidden_states, cell_states, context_vec, t, x)
-            mel_outputs.append(mel_frames)
-            attn_scores.append(scores)
-            stop_outputs.extend([stop_tokens] * self.r)
-            # Stop the loop when all stop tokens in batch exceed threshold
-            if (stop_tokens > 0.5).all() and t > 10: break
-
-        # Concat the mel outputs into sequence
-        mel_outputs = torch.cat(mel_outputs, dim=2)
-
-        # Post-Process for Linear Spectrograms
-        postnet_out = self.postnet(mel_outputs)
-        linear = self.post_proj(postnet_out)
-
-
-        linear = linear.transpose(1, 2)
-
-        # For easy visualisation
-        attn_scores = torch.cat(attn_scores, 1)
-        stop_outputs = torch.cat(stop_outputs, 1)
-
-        self.train()
-
-        return mel_outputs, linear, attn_scores
-
-    def init_model(self):
-        for p in self.parameters():
-            if p.dim() > 1: nn.init.xavier_uniform_(p)
-
-    def get_step(self):
-        return self.step.data.item()
-
-    def reset_step(self):
-        # assignment to parameters or buffers is overloaded, updates internal dict entry
-        self.step = self.step.data.new_tensor(1)
-
-    def log(self, path, msg):
-        with open(path, "a") as f:
-            print(msg, file=f)
-
-    def load(self, path, optimizer=None):
-        # Use device of model params as location for loaded state
-        device = next(self.parameters()).device
-        checkpoint = torch.load(str(path), map_location=device)
-        self.load_state_dict(checkpoint["model_state"])
-
-        if "optimizer_state" in checkpoint and optimizer is not None:
-            optimizer.load_state_dict(checkpoint["optimizer_state"])
-
-    def save(self, path, optimizer=None):
-        if optimizer is not None:
-            torch.save({
-                "model_state": self.state_dict(),
-                "optimizer_state": optimizer.state_dict(),
-            }, str(path))
-        else:
-            torch.save({
-                "model_state": self.state_dict(),
-            }, str(path))
-
-
-    def num_params(self, print_out=True):
-        parameters = filter(lambda p: p.requires_grad, self.parameters())
-        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
-        if print_out:
-            print("Trainable Parameters: %.3fM" % parameters)
-        return parameters
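For reference, a sketch that instantiates the deleted Tacotron with the hyperparameters above and runs generate() on dummy inputs, mirroring what inference.py does (illustrative only, not part of this commit; the random character ids and zero speaker embedding are placeholders, so the output is untrained noise):

import torch
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols

model = Tacotron(embed_dims=hparams.tts_embed_dims, num_chars=len(symbols),
                 encoder_dims=hparams.tts_encoder_dims, decoder_dims=hparams.tts_decoder_dims,
                 n_mels=hparams.num_mels, fft_bins=hparams.num_mels,
                 postnet_dims=hparams.tts_postnet_dims, encoder_K=hparams.tts_encoder_K,
                 lstm_dims=hparams.tts_lstm_dims, postnet_K=hparams.tts_postnet_K,
                 num_highways=hparams.tts_num_highways, dropout=hparams.tts_dropout,
                 stop_threshold=hparams.tts_stop_threshold,
                 speaker_embedding_size=hparams.speaker_embedding_size)

chars = torch.randint(1, len(symbols), (1, 20))                     # one fake sequence of 20 character ids
speaker_embedding = torch.zeros(1, hparams.speaker_embedding_size)  # placeholder speaker embedding
mels, linear, attn = model.generate(chars, speaker_embedding, steps=200)
# mels and linear: (1, num_mels, T) since fft_bins is set to num_mels; attn: (1, decoder_steps, 20)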
synthesizer/preprocess.py
DELETED
@@ -1,258 +0,0 @@
-from multiprocessing.pool import Pool
-from synthesizer import audio
-from functools import partial
-from itertools import chain
-from encoder import inference as encoder
-from pathlib import Path
-from utils import logmmse
-from tqdm import tqdm
-import numpy as np
-import librosa
-
-
-def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
-                       no_alignments: bool, datasets_name: str, subfolders: str):
-    # Gather the input directories
-    dataset_root = datasets_root.joinpath(datasets_name)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
-    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
-    assert all(input_dir.exists() for input_dir in input_dirs)
-
-    # Create the output directories for each output file type
-    out_dir.joinpath("mels").mkdir(exist_ok=True)
-    out_dir.joinpath("audio").mkdir(exist_ok=True)
-
-    # Create a metadata file
-    metadata_fpath = out_dir.joinpath("train.txt")
-    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
-
-    # Preprocess the dataset
-    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, no_alignments=no_alignments)
-    job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
-        for metadatum in speaker_metadata:
-            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
-    metadata_file.close()
-
-    # Verify the contents of the metadata file
-    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
-        metadata = [line.split("|") for line in metadata_file]
-        mel_frames = sum([int(m[4]) for m in metadata])
-        timesteps = sum([int(m[3]) for m in metadata])
-        sample_rate = hparams.sample_rate
-        hours = (timesteps / sample_rate) / 3600
-        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
-              (len(metadata), mel_frames, timesteps, hours))
-        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
-        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
-        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
-
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                      skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                      skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-
-def split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    #     print("")
-
-    return wavs, texts
-
-
-def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                      skip_existing: bool, hparams):
-    ## FOR REFERENCE:
-    # For you not to lose your head if you ever wish to change things here or implement your own
-    # synthesizer.
-    # - Both the audios and the mel spectrograms are saved as numpy arrays
-    # - There is no processing done to the audios that will be saved to disk beyond volume
-    #   normalization (in split_on_silences)
-    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
-    #   is why we re-apply it on the audio on the side of the vocoder.
-    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
-    #   without extra padding. This means that you won't have an exact relation between the length
-    #   of the wav and of the mel spectrogram. See the vocoder data loader.
-
-
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
-    # Trim silence
-    if hparams.trim_silence:
-        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
-
-    # Skip utterances that are too short
-    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
-        return None
-
-    # Compute the mel spectrogram
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-    mel_frames = mel_spectrogram.shape[1]
-
-    # Skip utterances that are too long
-    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
-        return None
-
-    # Write the spectrogram, embed and audio to disk
-    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
-    np.save(wav_fpath, wav, allow_pickle=False)
-
-    # Return a tuple describing this training example
-    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
-
-
-def embed_utterance(fpaths, encoder_model_fpath):
-    if not encoder.is_loaded():
-        encoder.load_model(encoder_model_fpath)
-
-    # Compute the speaker embedding of the utterance
-    wav_fpath, embed_fpath = fpaths
-    wav = np.load(wav_fpath)
-    wav = encoder.preprocess_wav(wav)
-    embed = encoder.embed_utterance(wav)
-    np.save(embed_fpath, embed, allow_pickle=False)
-
-
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
-    wav_dir = synthesizer_root.joinpath("audio")
-    metadata_fpath = synthesizer_root.joinpath("train.txt")
-    assert wav_dir.exists() and metadata_fpath.exists()
-    embed_dir = synthesizer_root.joinpath("embeds")
-    embed_dir.mkdir(exist_ok=True)
-
-    # Gather the input wave filepath and the target output embed filepath
-
with metadata_fpath.open("r") as metadata_file:
|
250 |
-
metadata = [line.split("|") for line in metadata_file]
|
251 |
-
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
|
252 |
-
|
253 |
-
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
254 |
-
# Embed the utterances in separate threads
|
255 |
-
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
256 |
-
job = Pool(n_processes).imap(func, fpaths)
|
257 |
-
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
synthesizer/synthesize.py
DELETED
@@ -1,92 +0,0 @@
import platform
from functools import partial
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from synthesizer.hparams import hparams_debug_string
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import data_parallel_workaround
from synthesizer.utils.symbols import symbols


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = out_dir / "mels_gta"
    synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Set model to eval mode (disable gradient and zoneout)
    model.eval()

    # Initialize the dataset
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    data_loader = DataLoader(dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)

    # Generate GTA mels
    meta_out_fpath = out_dir / "synthesized.txt"
    with meta_out_fpath.open("w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == "cuda" and torch.cuda.device_count() > 1:
                _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
            else:
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = Path(synth_dir).joinpath(dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(dataset.metadata[k]))
synthesizer/synthesizer_dataset.py
DELETED
@@ -1,92 +0,0 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence


class SynthesizerDataset(Dataset):
    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        mel_fnames = [x[1] for x in metadata if int(x[4])]
        mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
        embed_fnames = [x[2] for x in metadata if int(x[4])]
        embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
        self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
        self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel.astype(np.float32), embed.astype(np.float32), index

    def __len__(self):
        return len(self.samples_fpaths)


def collate_synthesizer(batch, r, hparams):
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    # Mel spectrogram
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % r != 0:
        max_spec_len += r - max_spec_len % r

    # WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
    # By default, SV2TTS uses symmetric mels, where -1*max_abs_value is silence.
    if hparams.symmetric_mels:
        mel_pad_value = -1 * hparams.max_abs_value
    else:
        mel_pad_value = 0

    mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
    mel = np.stack(mel)

    # Speaker embedding (SV2TTS)
    embeds = np.array([x[2] for x in batch])

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]


    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices

def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)

def pad2d(x, max_len, pad_value=0):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
synthesizer/train.py
DELETED
@@ -1,258 +0,0 @@
from datetime import datetime
from functools import partial
from pathlib import Path

import torch
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader

from synthesizer import audio
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import ValueWindow, data_parallel_workaround
from synthesizer.utils.plot import plot_spectrogram
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from vocoder.display import *


def np_now(x: torch.Tensor): return x.detach().cpu().numpy()


def time_string():
    return datetime.now().strftime("%Y-%m-%d %H:%M")


def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool,
          hparams):
    models_dir.mkdir(exist_ok=True)

    model_dir = models_dir.joinpath(run_id)
    plot_dir = model_dir.joinpath("plots")
    wav_dir = model_dir.joinpath("wavs")
    mel_output_dir = model_dir.joinpath("mel-spectrograms")
    meta_folder = model_dir.joinpath("metas")
    model_dir.mkdir(exist_ok=True)
    plot_dir.mkdir(exist_ok=True)
    wav_dir.mkdir(exist_ok=True)
    mel_output_dir.mkdir(exist_ok=True)
    meta_folder.mkdir(exist_ok=True)

    weights_fpath = model_dir / f"synthesizer.pt"
    metadata_fpath = syn_dir.joinpath("train.txt")

    print("Checkpoint path: {}".format(weights_fpath))
    print("Loading training data from: {}".format(metadata_fpath))
    print("Using model: Tacotron")

    # Bookkeeping
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)

    # From WaveRNN/train_tacotron.py
    if torch.cuda.is_available():
        device = torch.device("cuda")

        for session in hparams.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError("`batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Using device:", device)

    # Instantiate Tacotron Model
    print("\nInitialising Tacotron Model...\n")
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=hparams.tts_dropout,
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())

    # Load the weights
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of Tacotron from scratch\n")
        model.save(weights_fpath)

        # Embeddings metadata
        char_embedding_fpath = meta_folder.joinpath("CharacterEmbeddings.tsv")
        with open(char_embedding_fpath, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("Tacotron weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt")
    mel_dir = syn_dir.joinpath("mels")
    embed_dir = syn_dir.joinpath("embeds")
    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)

    for i, session in enumerate(hparams.tts_schedule):
        current_step = model.get_step()

        r, lr, max_step, batch_size = session

        training_steps = max_step - current_step

        # Do we need to change to the next session?
        if current_step >= max_step:
            # Are there no further sessions than the current one?
            if i == len(hparams.tts_schedule) - 1:
                # We have completed training. Save the model and exit
                model.save(weights_fpath, optimizer)
                break
            else:
                # There is a following session, go to it
                continue

        model.r = r

        # Begin the training
        simple_table([(f"Steps with r={r}", str(training_steps // 1000) + "k Steps"),
                      ("Batch Size", batch_size),
                      ("Learning Rate", lr),
                      ("Outputs/Step (r)", model.r)])

        for p in optimizer.param_groups:
            p["lr"] = lr

        collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
        data_loader = DataLoader(dataset, batch_size, shuffle=True, num_workers=2, collate_fn=collate_fn)

        total_iters = len(dataset)
        steps_per_epoch = np.ceil(total_iters / batch_size).astype(np.int32)
        epochs = np.ceil(training_steps / steps_per_epoch).astype(np.int32)

        for epoch in range(1, epochs+1):
            for i, (texts, mels, embeds, idx) in enumerate(data_loader, 1):
                start_time = time.time()

                # Generate stop tokens for training
                stop = torch.ones(mels.shape[0], mels.shape[2])
                for j, k in enumerate(idx):
                    stop[j, :int(dataset.metadata[k][4])-1] = 0

                texts = texts.to(device)
                mels = mels.to(device)
                embeds = embeds.to(device)
                stop = stop.to(device)

                # Forward pass
                # Parallelize model onto GPUS using workaround due to python bug
                if device.type == "cuda" and torch.cuda.device_count() > 1:
                    m1_hat, m2_hat, attention, stop_pred = data_parallel_workaround(model, texts, mels, embeds)
                else:
                    m1_hat, m2_hat, attention, stop_pred = model(texts, mels, embeds)

                # Backward pass
                m1_loss = F.mse_loss(m1_hat, mels) + F.l1_loss(m1_hat, mels)
                m2_loss = F.mse_loss(m2_hat, mels)
                stop_loss = F.binary_cross_entropy(stop_pred, stop)

                loss = m1_loss + m2_loss + stop_loss

                optimizer.zero_grad()
                loss.backward()

                if hparams.tts_clip_grad_norm is not None:
                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.tts_clip_grad_norm)
                    if np.isnan(grad_norm.cpu()):
                        print("grad_norm was NaN!")

                optimizer.step()

                time_window.append(time.time() - start_time)
                loss_window.append(loss.item())

                step = model.get_step()
                k = step // 1000

                msg = f"| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | " \
                      f"{1./time_window.average:#.2} steps/s | Step: {k}k | "
                stream(msg)

                # Backup or save model as appropriate
                if backup_every != 0 and step % backup_every == 0 :
                    backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt"
                    model.save(backup_fpath, optimizer)

                if save_every != 0 and step % save_every == 0 :
                    # Must save latest optimizer state to ensure that resuming training
                    # doesn't produce artifacts
                    model.save(weights_fpath, optimizer)

                # Evaluate model to generate samples
                epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch  # If epoch is done
                step_eval = hparams.tts_eval_interval > 0 and step % hparams.tts_eval_interval == 0  # Every N steps
                if epoch_eval or step_eval:
                    for sample_idx in range(hparams.tts_eval_num_samples):
                        # At most, generate samples equal to number in the batch
                        if sample_idx + 1 <= len(texts):
                            # Remove padding from mels using frame length in metadata
                            mel_length = int(dataset.metadata[idx[sample_idx]][4])
                            mel_prediction = np_now(m2_hat[sample_idx]).T[:mel_length]
                            target_spectrogram = np_now(mels[sample_idx]).T[:mel_length]
                            attention_len = mel_length // model.r

                            eval_model(attention=np_now(attention[sample_idx][:, :attention_len]),
                                       mel_prediction=mel_prediction,
                                       target_spectrogram=target_spectrogram,
                                       input_seq=np_now(texts[sample_idx]),
                                       step=step,
                                       plot_dir=plot_dir,
                                       mel_output_dir=mel_output_dir,
                                       wav_dir=wav_dir,
                                       sample_num=sample_idx + 1,
                                       loss=loss,
                                       hparams=hparams)

                # Break out of loop to update training schedule
                if step >= max_step:
                    break

            # Add line break after every epoch
            print("")


def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    # Save some results for evaluation
    attention_path = str(plot_dir.joinpath("attention_step_{}_sample_{}".format(step, sample_num)))
    save_attention(attention, attention_path)

    # save predicted mel spectrogram to disk (debug)
    mel_output_fpath = mel_output_dir.joinpath("mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(mel_output_fpath), mel_prediction, allow_pickle=False)

    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    wav_fpath = wav_dir.joinpath("step-{}-wave-from-mel_sample_{}.wav".format(step, sample_num))
    audio.save_wav(wav, str(wav_fpath), sr=hparams.sample_rate)

    # save real and predicted mel-spectrogram plot to disk (control purposes)
    spec_fpath = plot_dir.joinpath("step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss)
    plot_spectrogram(mel_prediction, str(spec_fpath), title=title_str,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)
    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
synthesizer/utils/__init__.py
DELETED
@@ -1,45 +0,0 @@
import torch


_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


class ValueWindow():
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []
synthesizer/utils/_cmudict.py
DELETED
@@ -1,62 +0,0 @@
import re

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
    "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
    "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
    "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
    "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries


    def __len__(self):
        return len(self._entries)


    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())



_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
synthesizer/utils/cleaners.py
DELETED
@@ -1,88 +0,0 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""
import re
from unidecode import unidecode
from synthesizer.utils.numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
    ("mrs", "misess"),
    ("mr", "mister"),
    ("dr", "doctor"),
    ("st", "saint"),
    ("co", "company"),
    ("jr", "junior"),
    ("maj", "major"),
    ("gen", "general"),
    ("drs", "doctors"),
    ("rev", "reverend"),
    ("lt", "lieutenant"),
    ("hon", "honorable"),
    ("sgt", "sergeant"),
    ("capt", "captain"),
    ("esq", "esquire"),
    ("ltd", "limited"),
    ("col", "colonel"),
    ("ft", "fort"),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    """lowercase input tokens."""
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
synthesizer/utils/numbers.py
DELETED
@@ -1,69 +0,0 @@
import re
import inflect


_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
synthesizer/utils/plot.py
DELETED
@@ -1,82 +0,0 @@
import numpy as np


def split_title_line(title_text, max_words=5):
    """
    A function that splits any string based on specific character
    (returning it with the string), with maximum number of words on it
    """
    seq = title_text.split()
    return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])


def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    if max_len is not None:
        alignment = alignment[:, :max_len]

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    im = ax.imshow(
        alignment,
        aspect="auto",
        origin="lower",
        interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"

    if split_title:
        title = split_title_line(title)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    if max_len is not None:
        target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    if split_title:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    #target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
synthesizer/utils/symbols.py
DELETED
@@ -1,17 +0,0 @@
"""
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
# from . import cmudict

_pad = "_"
_eos = "~"
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? "

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
#_arpabet = ["@' + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = [_pad, _eos] + list(_characters) #+ _arpabet
synthesizer/utils/text.py
DELETED
@@ -1,75 +0,0 @@
from synthesizer.utils.symbols import symbols
from synthesizer.utils import cleaners
import re


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string"""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ("_", "~")