Upload 61 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitattributes +1 -34
- .gitignore +20 -0
- 1320_00000.mp3 +0 -0
- 3575_00000.mp3 +0 -0
- 6829_00000.mp3 +0 -0
- 8230_00000.mp3 +0 -0
- Kor_preprocess.py +178 -0
- LICENSE.txt +24 -0
- README.md +20 -10
- VCTK.txt +94 -0
- __init__.py +0 -0
- _cmudict.py +62 -0
- argutils.py +40 -0
- audio.py +108 -0
- cleaners.py +96 -0
- config.py +45 -0
- deepmind_version.py +170 -0
- demo_cli.py +225 -0
- demo_toolbox.py +43 -0
- display.py +120 -0
- distribution.py +132 -0
- encoder_preprocess.py +70 -0
- encoder_train.py +47 -0
- fatchord_version.py +434 -0
- gen_wavernn.py +31 -0
- hparams.py +44 -0
- inference.py +64 -0
- ko_dictionary.py +174 -0
- korean.py +349 -0
- logmmse.py +247 -0
- model.py +135 -0
- modelutils.py +17 -0
- numbers.py +68 -0
- p240_00000.mp3 +0 -0
- p260_00000.mp3 +0 -0
- params_data.py +29 -0
- params_model.py +11 -0
- plot.py +76 -0
- preprocess.py +259 -0
- preprocess_kspon.py +65 -0
- profiler.py +45 -0
- random_cycler.py +37 -0
- requirements.txt +19 -0
- speaker.py +40 -0
- speaker_batch.py +12 -0
- speaker_verification_dataset.py +56 -0
- symbols.py +20 -0
- synthesize.py +97 -0
- synthesizer_dataset.py +96 -0
- synthesizer_preprocess_audio.py +42 -0
.gitattributes
CHANGED
@@ -1,34 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ipynb linguist-vendored
.gitignore
ADDED
@@ -0,0 +1,20 @@
*.pyc
*.aux
*.log
*.out
*.synctex.gz
*.suo
*__pycache__
*.idea
*.ipynb_checkpoints
*.pickle
*.npy
*.blg
*.bbl
*.bcf
*.toc
*.wav
*.sh
encoder/saved_models/*
synthesizer/saved_models/*
vocoder/saved_models/*
1320_00000.mp3
ADDED
Binary file (15.5 kB)
3575_00000.mp3
ADDED
Binary file (15.5 kB)
6829_00000.mp3
ADDED
Binary file (15.6 kB)
8230_00000.mp3
ADDED
Binary file (16.1 kB)
Kor_preprocess.py
ADDED
@@ -0,0 +1,178 @@
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
import os
##


def preprocess_KSponSpeech(datasets_root: Path, out_dir: Path, n_processes: int,
                           skip_existing: bool, hparams):
    # Gather the input directories
    dataset_root = datasets_root.joinpath("KSponSpeech")
    input_dirs = [dataset_root.joinpath("KsponSpeech_01"),
                  dataset_root.joinpath("KsponSpeech_02"),
                  dataset_root.joinpath("KsponSpeech_03"),
                  dataset_root.joinpath("KsponSpeech_04"),
                  dataset_root.joinpath("KsponSpeech_05")]

    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="cp949")

    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))  # every folder (speaker) inside each input directory
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, "KSponSpeech", len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="cp949") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    check_list = [",01", ",02", ",03", ",04", ",05", ",06", ",07", ",08", ",09"]
    # Gather the utterance audios and texts
    # try:
    files = os.listdir(speaker_dir)

    for file in files:
        if file.endswith("alignment.txt"):
            with open(os.path.join(speaker_dir, file), "r", encoding="cp949") as alignments_file:
                alignments = [line.rstrip().split(" ") for line in alignments_file.readlines()]
    # # except StopIteration:
    # #     # A few alignment files will be missing
    # #     continue

    # Iterate over each entry in the alignments file
    for wav_fname, words in alignments:

        for check in check_list:
            if check in words:
                print(words)
                words = "pass"

        wav_fpath = speaker_dir.joinpath(wav_fname + ".pcm")
        assert wav_fpath.exists()
        # words = words.replace("\"", "").split(",")
        # end_times = list(map(float, end_times.replace("\"", "").split(",")))
        #
        # # Process each sub-utterance
        wavs = normalization(wav_fpath, hparams)

        if wavs is not None and words != "pass":
            sub_basename = "%s" % (wav_fname)
            metadata.append(process_utterance(wavs, words, out_dir, sub_basename,
                                              skip_existing, hparams))

    return [m for m in metadata if m is not None]


def normalization(wav_fpath, hparams):
    try:
        wav = np.memmap(wav_fpath, dtype='h', mode='r')
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
    except EOFError:
        print(wav_fpath)
        return None
    return wav


def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)


def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate threads
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
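Note: KSponSpeech distributes its audio as headerless signed 16-bit PCM, which is why normalization() above reads each file with np.memmap(dtype='h') rather than librosa. The following is a minimal, standalone sketch of that loading step (NumPy only); the file path and the 0.9 rescaling value are illustrative assumptions, not values taken from this upload.

import numpy as np

def load_kspon_pcm(pcm_path, rescaling_max=0.9):
    # Headerless signed 16-bit samples, read the same way as normalization() above.
    wav = np.memmap(pcm_path, dtype="h", mode="r").astype(np.float32)
    # Peak-normalise to +/- rescaling_max, mirroring hparams.rescale / hparams.rescaling_max.
    return wav / np.abs(wav).max() * rescaling_max

# Hypothetical path; adjust to wherever KSponSpeech is extracted.
# wav = load_kspon_pcm("KsponSpeech_01/KsponSpeech_0001/KsponSpeech_000001.pcm")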
LICENSE.txt
ADDED
@@ -0,0 +1,24 @@
MIT License

Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Original work Copyright (c) 2015 braindead (https://github.com/braindead)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,10 +1,20 @@
** Temporarily suspended



# Real-Time Korean Voice Cloning
This repository is a Korean version of SV2TTS. The original model, developed by CorentinJ (https://github.com/CorentinJ/Real-Time-Voice-Cloning), is based on English.
To implement Korean speech on the model, I referred to tail95 (https://github.com/tail95/Voice-Cloning).
I changed some code to make preprocessing (audio and text) and training more convenient. I also converted the TensorFlow model to a PyTorch model and fixed some errors.

## References
- https://github.com/CorentinJ/Real-Time-Voice-Cloning
- https://github.com/tail95/Voice-Cloning
- https://medium.com/analytics-vidhya/the-intuition-behind-voice-cloning-with-5-seconds-of-audio-5989e9b2e042
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis (https://arxiv.org/abs/1806.04558)


## Used Dataset
- KSponSpeech (https://aihub.or.kr/aidata/105)

Make sure that your dataset has text-audio pairs.
VCTK.txt
ADDED
@@ -0,0 +1,94 @@
---------------------------------------------------------------------
CSTR VCTK Corpus
English Multi-speaker Corpus for CSTR Voice Cloning Toolkit

(Version 0.92)
RELEASE September 2019
The Centre for Speech Technology Research
University of Edinburgh
Copyright (c) 2019

Junichi Yamagishi

---------------------------------------------------------------------

Overview

This CSTR VCTK Corpus includes speech data uttered by 110 English
speakers with various accents. Each speaker reads out about 400
sentences, which were selected from a newspaper, the rainbow passage
and an elicitation paragraph used for the speech accent archive.

The newspaper texts were taken from Herald Glasgow, with permission
from Herald & Times Group. Each speaker has a different set of the
newspaper texts selected based on a greedy algorithm that increases the
contextual and phonetic coverage. The details of the text selection
algorithms are described in the following paper:

C. Veaux, J. Yamagishi and S. King,
"The voice bank corpus: Design, collection and data analysis of
a large regional accent speech database,"
https://doi.org/10.1109/ICSDA.2013.6709856

The rainbow passage and elicitation paragraph are the same for all
speakers. The rainbow passage can be found at the International Dialects
of English Archive
(http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation
paragraph is identical to the one used for the speech accent archive
(http://accent.gmu.edu). The details of the speech accent archive
can be found at
http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf

All speech data was recorded using an identical recording setup: an
omni-directional microphone (DPA 4035) and a small diaphragm condenser
microphone with very wide bandwidth (Sennheiser MKH 800), 96kHz
sampling frequency at 24 bits and in a hemi-anechoic chamber of
the University of Edinburgh. (However, two speakers, p280 and p315,
had technical issues with the audio recordings using the MKH 800.)
All recordings were converted into 16 bits, were downsampled to
48 kHz, and were manually end-pointed.

This corpus was originally aimed at HMM-based text-to-speech synthesis
systems, especially for speaker-adaptive HMM-based speech synthesis
that uses average voice models trained on multiple speakers and speaker
adaptation technologies. This corpus is also suitable for DNN-based
multi-speaker text-to-speech synthesis systems and waveform modeling.

COPYING

This corpus is licensed under the Creative Commons License: Attribution 4.0 International
http://creativecommons.org/licenses/by/4.0/legalcode

VCTK VARIANTS
There are several variants of the VCTK corpus:
Speech enhancement
- Noisy speech database for training speech enhancement algorithms and TTS models, where various types of noise were added to VCTK artificially: http://dx.doi.org/10.7488/ds/2117
- Reverberant speech database for training speech dereverberation algorithms and TTS models, where various types of reverberation were added to VCTK artificially: http://dx.doi.org/10.7488/ds/1425
- Noisy reverberant speech database for training speech enhancement algorithms and TTS models: http://dx.doi.org/10.7488/ds/2139
- Device Recorded VCTK, where speech signals of the VCTK corpus were played back and re-recorded in office environments using relatively inexpensive consumer devices: http://dx.doi.org/10.7488/ds/2316
- The Microsoft Scalable Noisy Speech Dataset (MS-SNSD): https://github.com/microsoft/MS-SNSD

ASV and anti-spoofing
- Spoofing and Anti-Spoofing (SAS) corpus, a collection of synthetic speech signals produced by nine techniques, two of which are speech synthesis and seven of which are voice conversion. All of them were built using the VCTK corpus. http://dx.doi.org/10.7488/ds/252
- Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) Database. This database consists of synthetic speech signals produced by ten techniques and was used in the first Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015): http://dx.doi.org/10.7488/ds/298
- ASVspoof 2019: the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge database, used in the ASVspoof 2019 challenge: https://doi.org/10.7488/ds/2555


ACKNOWLEDGEMENTS

The CSTR VCTK Corpus was constructed by:

Christophe Veaux (University of Edinburgh)
Junichi Yamagishi (University of Edinburgh)
Kirsten MacDonald

The research leading to these results was partly funded by EPSRC
grants EP/I031022/1 (NST) and EP/J002526/1 (CAF), by the RSE-NSFC
grant (61111130120), and by JST CREST (uDialogue).

Please cite this corpus as follows:
Christophe Veaux, Junichi Yamagishi, Kirsten MacDonald,
"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit",
The Centre for Speech Technology Research (CSTR),
University of Edinburgh
__init__.py
ADDED
File without changes
_cmudict.py
ADDED
@@ -0,0 +1,62 @@
import re

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
    "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
    "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
    "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
    "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())


_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
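Note: a hedged usage sketch of the CMUDict wrapper above. The dictionary filename is a placeholder for a separately downloaded CMU Pronouncing Dictionary file; it is not shipped in this upload.

# "cmudict-0.7b.txt" is a placeholder path to a downloaded CMU dictionary file.
cmudict = CMUDict("cmudict-0.7b.txt", keep_ambiguous=False)
print(len(cmudict))              # number of unambiguous entries
print(cmudict.lookup("hello"))   # e.g. ["HH AH0 L OW1"] (exact output depends on the dictionary file)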
argutils.py
ADDED
@@ -0,0 +1,40 @@
from pathlib import Path
import numpy as np
import argparse

_type_priorities = [    # In decreasing order
    Path,
    str,
    int,
    float,
    bool,
]

def _priority(o):
    p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None)
    if p is not None:
        return p
    p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None)
    if p is not None:
        return p
    return len(_type_priorities)

def print_args(args: argparse.Namespace, parser=None):
    args = vars(args)
    if parser is None:
        priorities = list(map(_priority, args.values()))
    else:
        all_params = [a.dest for g in parser._action_groups for a in g._group_actions]
        priority = lambda p: all_params.index(p) if p in all_params else len(all_params)
        priorities = list(map(priority, args.keys()))

    pad = max(map(len, args.keys())) + 3
    indices = np.lexsort((list(args.keys()), priorities))
    items = list(args.items())

    print("Arguments:")
    for i in indices:
        param, value = items[i]
        print("    {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value))
    print("")
audio.py
ADDED
@@ -0,0 +1,108 @@
import math
import numpy as np
import librosa
import vocoder.hparams as hp
from scipy.signal import lfilter
import soundfile as sf


def label_2_float(x, bits):
    return 2 * x / (2**bits - 1.) - 1.


def float_2_label(x, bits):
    assert abs(x).max() <= 1.0
    x = (x + 1.) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


def load_wav(path):
    return librosa.load(str(path), sr=hp.sample_rate)[0]


def save_wav(x, path):
    sf.write(path, x.astype(np.float32), hp.sample_rate)


def split_signal(x):
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine


def combine_signal(coarse, fine):
    return coarse * 256 + fine - 2**15


def encode_16bits(x):
    return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)


mel_basis = None


def linear_to_mel(spectrogram):
    global mel_basis
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)


def build_mel_basis():
    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


def normalize(S):
    return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)


def denormalize(S):
    return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return np.power(10.0, x * 0.05)


def spectrogram(y):
    D = stft(y)
    S = amp_to_db(np.abs(D)) - hp.ref_level_db
    return normalize(S)


def melspectrogram(y):
    D = stft(y)
    S = amp_to_db(linear_to_mel(np.abs(D)))
    return normalize(S)


def stft(y):
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)


def pre_emphasis(x):
    return lfilter([1, -hp.preemphasis], [1], x)


def de_emphasis(x):
    return lfilter([1], [1, -hp.preemphasis], x)


def encode_mu_law(x, mu):
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


def decode_mu_law(y, mu, from_labels=True):
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
    return x
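Note: encode_mu_law/decode_mu_law above compand waveform samples into a small number of integer classes and back. Below is a self-contained round-trip sketch of the same formulas (NumPy only, no vocoder.hparams dependency); the 9-bit setting and the test tone are illustrative assumptions, not values taken from this upload.

import numpy as np

def mu_law_encode(x, mu=2**9):
    # Compand a [-1, 1] float signal into mu integer classes (here 2**9 = 512).
    m = mu - 1
    fx = np.sign(x) * np.log1p(m * np.abs(x)) / np.log1p(m)
    return np.floor((fx + 1) / 2 * m + 0.5).astype(np.int64)

def mu_law_decode(labels, mu=2**9):
    # Map labels back to [-1, 1] (as label_2_float does), then expand.
    m = mu - 1
    y = 2 * labels / m - 1.0
    return np.sign(y) / m * ((1 + m) ** np.abs(y) - 1)

t = np.linspace(0, 1, 16000, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)                  # dummy 440 Hz test tone
err = np.max(np.abs(mu_law_decode(mu_law_encode(tone)) - tone))
print("max round-trip error: %.4f" % err)                 # small quantisation error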
cleaners.py
ADDED
@@ -0,0 +1,96 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""

import re
from unidecode import unidecode
from .numbers import normalize_numbers
from .korean import tokenize as ko_tokenize

# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")


def korean_cleaners(text):
    '''Pipeline for Korean text, including number and abbreviation expansion.'''
    text = ko_tokenize(text)    # e.g. '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~']
    return text


# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
    ("mrs", "misess"),
    ("mr", "mister"),
    ("dr", "doctor"),
    ("st", "saint"),
    ("co", "company"),
    ("jr", "junior"),
    ("maj", "major"),
    ("gen", "general"),
    ("drs", "doctors"),
    ("rev", "reverend"),
    ("lt", "lieutenant"),
    ("hon", "honorable"),
    ("sgt", "sergeant"),
    ("capt", "captain"),
    ("esq", "esquire"),
    ("ltd", "limited"),
    ("col", "colonel"),
    ("ft", "fort"),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    """Lowercase input tokens."""
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
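Note: korean_cleaners above delegates the Hangul-to-jamo decomposition to korean.tokenize, whose source is not shown in this view. As a rough, hedged illustration of what such a decomposition produces, the snippet below uses Python's standard unicodedata NFD normalisation; it is not the repository's tokenizer, and the exact jamo inventory and punctuation handling are assumptions.

import unicodedata

def decompose_to_jamo(text):
    # NFD normalisation splits precomposed Hangul syllables (U+AC00..U+D7A3)
    # into their conjoining jamo (initial / medial / final consonants and vowels).
    return list(unicodedata.normalize("NFD", text))

print(decompose_to_jamo("존경하는"))
# ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']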
config.py
ADDED
@@ -0,0 +1,45 @@
librispeech_datasets = {
    "train": {
        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
        "other": ["LibriSpeech/train-other-500"]
    },
    "test": {
        "clean": ["LibriSpeech/test-clean"],
        "other": ["LibriSpeech/test-other"]
    },
    "dev": {
        "clean": ["LibriSpeech/dev-clean"],
        "other": ["LibriSpeech/dev-other"]
    },
}
libritts_datasets = {
    "train": {
        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
        "other": ["LibriTTS/train-other-500"]
    },
    "test": {
        "clean": ["LibriTTS/test-clean"],
        "other": ["LibriTTS/test-other"]
    },
    "dev": {
        "clean": ["LibriTTS/dev-clean"],
        "other": ["LibriTTS/dev-other"]
    },
}
voxceleb_datasets = {
    "voxceleb1": {
        "train": ["VoxCeleb1/wav"],
        "test": ["VoxCeleb1/test_wav"]
    },
    "voxceleb2": {
        "train": ["VoxCeleb2/dev/aac"],
        "test": ["VoxCeleb2/test_wav"]
    }
}

other_datasets = [
    "LJSpeech-1.1",
    "VCTK-Corpus/wav48",
]

anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
deepmind_version.py
ADDED
@@ -0,0 +1,170 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.display import *
from utils.dsp import *


class WaveRNN(nn.Module):
    def __init__(self, hidden_size=896, quantisation=256):
        super(WaveRNN, self).__init__()

        self.hidden_size = hidden_size
        self.split_size = hidden_size // 2

        # The main matmul
        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)

        # Output fc layers
        self.O1 = nn.Linear(self.split_size, self.split_size)
        self.O2 = nn.Linear(self.split_size, quantisation)
        self.O3 = nn.Linear(self.split_size, self.split_size)
        self.O4 = nn.Linear(self.split_size, quantisation)

        # Input fc layers
        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)

        # biases for the gates
        self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
        self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
        self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))

        # display num params
        self.num_params()


    def forward(self, prev_y, prev_hidden, current_coarse):

        # Main matmul - the projection is split 3 ways
        R_hidden = self.R(prev_hidden)
        R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)

        # Project the prev input
        coarse_input_proj = self.I_coarse(prev_y)
        I_coarse_u, I_coarse_r, I_coarse_e = \
            torch.split(coarse_input_proj, self.split_size, dim=1)

        # Project the prev input and current coarse sample
        fine_input = torch.cat([prev_y, current_coarse], dim=1)
        fine_input_proj = self.I_fine(fine_input)
        I_fine_u, I_fine_r, I_fine_e = \
            torch.split(fine_input_proj, self.split_size, dim=1)

        # concatenate for the gates
        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)

        # Compute all gates for coarse and fine
        u = F.sigmoid(R_u + I_u + self.bias_u)
        r = F.sigmoid(R_r + I_r + self.bias_r)
        e = F.tanh(r * R_e + I_e + self.bias_e)
        hidden = u * prev_hidden + (1. - u) * e

        # Split the hidden state
        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)

        # Compute outputs
        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
        out_fine = self.O4(F.relu(self.O3(hidden_fine)))

        return out_coarse, out_fine, hidden


    def generate(self, seq_len):
        with torch.no_grad():
            # First split up the biases for the gates
            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)

            # Lists for the two output seqs
            c_outputs, f_outputs = [], []

            # Some initial inputs
            out_coarse = torch.LongTensor([0]).cuda()
            out_fine = torch.LongTensor([0]).cuda()

            # We'll need a hidden state
            hidden = self.init_hidden()

            # Need a clock for display
            start = time.time()

            # Loop for generation
            for i in range(seq_len):

                # Split into two hidden states
                hidden_coarse, hidden_fine = \
                    torch.split(hidden, self.split_size, dim=1)

                # Scale and concat previous predictions
                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)

                # Project input
                coarse_input_proj = self.I_coarse(prev_outputs)
                I_coarse_u, I_coarse_r, I_coarse_e = \
                    torch.split(coarse_input_proj, self.split_size, dim=1)

                # Project hidden state and split 6 ways
                R_hidden = self.R(hidden)
                R_coarse_u, R_fine_u, \
                R_coarse_r, R_fine_r, \
                R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)

                # Compute the coarse gates
                u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
                r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
                e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
                hidden_coarse = u * hidden_coarse + (1. - u) * e

                # Compute the coarse output
                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
                posterior = F.softmax(out_coarse, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_coarse = distrib.sample()
                c_outputs.append(out_coarse)

                # Project the [prev outputs and predicted coarse sample]
                coarse_pred = out_coarse.float() / 127.5 - 1.
                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
                fine_input_proj = self.I_fine(fine_input)
                I_fine_u, I_fine_r, I_fine_e = \
                    torch.split(fine_input_proj, self.split_size, dim=1)

                # Compute the fine gates
                u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
                r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
                e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
                hidden_fine = u * hidden_fine + (1. - u) * e

                # Compute the fine output
                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
                posterior = F.softmax(out_fine, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_fine = distrib.sample()
                f_outputs.append(out_fine)

                # Put the hidden state back together
                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)

                # Display progress
                speed = (i + 1) / (time.time() - start)
                stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))

            coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
            fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
            output = combine_signal(coarse, fine)

        return output, coarse, fine

    def init_hidden(self, batch_size=1):
        return torch.zeros(batch_size, self.hidden_size).cuda()

    def num_params(self):
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
        print('Trainable Parameters: %.3f million' % parameters)
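Note: this alternative WaveRNN predicts each 16-bit sample as two 8-bit halves (coarse and fine), which generate() reassembles with combine_signal. Below is a standalone round-trip check of the same byte split that split_signal/combine_signal in audio.py implement; the sample values are illustrative.

import numpy as np

def split_signal(x):
    # Shift signed 16-bit samples into [0, 65535], then take the high and low bytes.
    unsigned = x + 2**15
    return unsigned // 256, unsigned % 256      # coarse (top 8 bits), fine (bottom 8 bits)

def combine_signal(coarse, fine):
    return coarse * 256 + fine - 2**15

samples = np.array([-32768, -1, 0, 1, 32767], dtype=np.int64)
coarse, fine = split_signal(samples)
assert np.array_equal(combine_signal(coarse, fine), samples)   # lossless round trip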
demo_cli.py
ADDED
@@ -0,0 +1,225 @@
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
from audioread.exceptions import NoBackendError

if __name__ == '__main__':
    ## Info & args
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_fpath", type=Path,
                        default="synthesizer/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved synthesizer")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    parser.add_argument("--no_mp3_support", action="store_true", help=\
        "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
    args = parser.parse_args()
    print_args(args, parser)
    if not args.no_sound:
        import sounddevice as sd

    if args.cpu:
        # Hide GPUs from Pytorch to force CPU processing
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    if not args.no_mp3_support:
        try:
            librosa.load("samples/1320_00000.mp3")
        except NoBackendError:
            print("Librosa will be unable to open mp3 files if additional software is not installed.\n"
                  "Please install ffmpeg or add the '--no_mp3_support' option to proceed without support for mp3 files.")
            exit(-1)

    print("Running a test of your configuration...\n")

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    ## Remind the user to download pretrained models if needed
    check_model_paths(encoder_path=args.enc_model_fpath,
                      synthesizer_path=args.syn_model_fpath,
                      vocoder_path=args.voc_model_fpath)

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_fpath)
    vocoder.load_model(args.voc_model_fpath)


    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

    print("All tests passed! You can now synthesize speech.\n\n")


    ## Interactive speech generation
    print("This is a GUI-less example of an interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                      "wav, m4a, flac, ...):\n"
            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))

            if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
                print("Can't use mp3 files, please try again:")
                continue
            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file successfully")

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")


            ## Generating the spectrogram
            text = input("Write a sentence (+-20 words) to be synthesized:\n")

            # If seed is specified, reset torch seed and force synthesizer reload
            if args.seed is not None:
                torch.manual_seed(args.seed)
                synthesizer = Synthesizer(args.syn_model_fpath)

            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")


            ## Generating the waveform
            print("Synthesizing the waveform:")

            # If seed is specified, reset torch seed and reload vocoder
            if args.seed is not None:
                torch.manual_seed(args.seed)
                vocoder.load_model(args.voc_model_fpath)

            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)


            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # Trim excess silences to compensate for gaps in spectrograms (issue #53)
            generated_wav = encoder.preprocess_wav(generated_wav)

            # Play the audio (non-blocking)
            if not args.no_sound:
                try:
                    sd.stop()
                    sd.play(generated_wav, synthesizer.sample_rate)
                except sd.PortAudioError as e:
                    print("\nCaught exception: %s" % repr(e))
                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
                except:
                    raise

            # Save it on the disk
            filename = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % filename)


        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
demo_toolbox.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from toolbox import Toolbox
|
3 |
+
from utils.argutils import print_args
|
4 |
+
from utils.modelutils import check_model_paths
|
5 |
+
import argparse
|
6 |
+
import os
|
7 |
+
|
8 |
+
|
9 |
+
if __name__ == '__main__':
|
10 |
+
parser = argparse.ArgumentParser(
|
11 |
+
description="Runs the toolbox",
|
12 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
13 |
+
)
|
14 |
+
|
15 |
+
parser.add_argument("-d", "--datasets_root", type=Path, help= \
|
16 |
+
"Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
|
17 |
+
"supported datasets.", default=None)
|
18 |
+
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
|
19 |
+
help="Directory containing saved encoder models")
|
20 |
+
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
|
21 |
+
help="Directory containing saved synthesizer models")
|
22 |
+
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
|
23 |
+
help="Directory containing saved vocoder models")
|
24 |
+
parser.add_argument("--cpu", action="store_true", help=\
|
25 |
+
"If True, processing is done on CPU, even when a GPU is available.")
|
26 |
+
parser.add_argument("--seed", type=int, default=None, help=\
|
27 |
+
"Optional random number seed value to make toolbox deterministic.")
|
28 |
+
parser.add_argument("--no_mp3_support", action="store_true", help=\
|
29 |
+
"If True, no mp3 files are allowed.")
|
30 |
+
args = parser.parse_args()
|
31 |
+
print_args(args, parser)
|
32 |
+
|
33 |
+
if args.cpu:
|
34 |
+
# Hide GPUs from Pytorch to force CPU processing
|
35 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
36 |
+
del args.cpu
|
37 |
+
|
38 |
+
## Remind the user to download pretrained models if needed
|
39 |
+
check_model_paths(encoder_path=args.enc_models_dir, synthesizer_path=args.syn_models_dir,
|
40 |
+
vocoder_path=args.voc_models_dir)
|
41 |
+
|
42 |
+
# Launch the toolbox
|
43 |
+
Toolbox(**vars(args))
|
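For reference, the script above is normally launched as `python demo_toolbox.py -d <datasets_root>`. The sketch below is a hypothetical programmatic equivalent built only from the argument names passed via `Toolbox(**vars(args))`; the dataset path is a placeholder.

from pathlib import Path
from toolbox import Toolbox

Toolbox(datasets_root=Path("datasets"),                  # placeholder path
        enc_models_dir=Path("encoder/saved_models"),
        syn_models_dir=Path("synthesizer/saved_models"),
        voc_models_dir=Path("vocoder/saved_models"),
        seed=None,
        no_mp3_support=False)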
display.py
ADDED
@@ -0,0 +1,120 @@
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
import time
|
3 |
+
import numpy as np
|
4 |
+
import sys
|
5 |
+
|
6 |
+
|
7 |
+
def progbar(i, n, size=16):
|
8 |
+
done = (i * size) // n
|
9 |
+
bar = ''
|
10 |
+
for i in range(size):
|
11 |
+
bar += '█' if i <= done else '░'
|
12 |
+
return bar
|
13 |
+
|
14 |
+
|
15 |
+
def stream(message) :
|
16 |
+
try:
|
17 |
+
sys.stdout.write("\r%s" % message)
|
18 |
+
except:
|
19 |
+
#Remove non-ASCII characters from message
|
20 |
+
message = ''.join(i for i in message if ord(i)<128)
|
21 |
+
sys.stdout.write("\r%s" % message)
|
22 |
+
|
23 |
+
|
24 |
+
def simple_table(item_tuples) :
|
25 |
+
|
26 |
+
border_pattern = '+---------------------------------------'
|
27 |
+
whitespace = ' '
|
28 |
+
|
29 |
+
headings, cells, = [], []
|
30 |
+
|
31 |
+
for item in item_tuples :
|
32 |
+
|
33 |
+
heading, cell = str(item[0]), str(item[1])
|
34 |
+
|
35 |
+
pad_head = True if len(heading) < len(cell) else False
|
36 |
+
|
37 |
+
pad = abs(len(heading) - len(cell))
|
38 |
+
pad = whitespace[:pad]
|
39 |
+
|
40 |
+
pad_left = pad[:len(pad)//2]
|
41 |
+
pad_right = pad[len(pad)//2:]
|
42 |
+
|
43 |
+
if pad_head :
|
44 |
+
heading = pad_left + heading + pad_right
|
45 |
+
else :
|
46 |
+
cell = pad_left + cell + pad_right
|
47 |
+
|
48 |
+
headings += [heading]
|
49 |
+
cells += [cell]
|
50 |
+
|
51 |
+
border, head, body = '', '', ''
|
52 |
+
|
53 |
+
for i in range(len(item_tuples)) :
|
54 |
+
|
55 |
+
temp_head = f'| {headings[i]} '
|
56 |
+
temp_body = f'| {cells[i]} '
|
57 |
+
|
58 |
+
border += border_pattern[:len(temp_head)]
|
59 |
+
head += temp_head
|
60 |
+
body += temp_body
|
61 |
+
|
62 |
+
if i == len(item_tuples) - 1 :
|
63 |
+
head += '|'
|
64 |
+
body += '|'
|
65 |
+
border += '+'
|
66 |
+
|
67 |
+
print(border)
|
68 |
+
print(head)
|
69 |
+
print(border)
|
70 |
+
print(body)
|
71 |
+
print(border)
|
72 |
+
print(' ')
|
73 |
+
|
74 |
+
|
75 |
+
def time_since(started) :
|
76 |
+
elapsed = time.time() - started
|
77 |
+
m = int(elapsed // 60)
|
78 |
+
s = int(elapsed % 60)
|
79 |
+
if m >= 60 :
|
80 |
+
h = int(m // 60)
|
81 |
+
m = m % 60
|
82 |
+
return f'{h}h {m}m {s}s'
|
83 |
+
else :
|
84 |
+
return f'{m}m {s}s'
|
85 |
+
|
86 |
+
|
87 |
+
def save_attention(attn, path) :
|
88 |
+
fig = plt.figure(figsize=(12, 6))
|
89 |
+
plt.imshow(attn.T, interpolation='nearest', aspect='auto')
|
90 |
+
fig.savefig(f'{path}.png', bbox_inches='tight')
|
91 |
+
plt.close(fig)
|
92 |
+
|
93 |
+
|
94 |
+
def save_spectrogram(M, path, length=None) :
|
95 |
+
M = np.flip(M, axis=0)
|
96 |
+
if length : M = M[:, :length]
|
97 |
+
fig = plt.figure(figsize=(12, 6))
|
98 |
+
plt.imshow(M, interpolation='nearest', aspect='auto')
|
99 |
+
fig.savefig(f'{path}.png', bbox_inches='tight')
|
100 |
+
plt.close(fig)
|
101 |
+
|
102 |
+
|
103 |
+
def plot(array) :
|
104 |
+
fig = plt.figure(figsize=(30, 5))
|
105 |
+
ax = fig.add_subplot(111)
|
106 |
+
ax.xaxis.label.set_color('grey')
|
107 |
+
ax.yaxis.label.set_color('grey')
|
108 |
+
ax.xaxis.label.set_fontsize(23)
|
109 |
+
ax.yaxis.label.set_fontsize(23)
|
110 |
+
ax.tick_params(axis='x', colors='grey', labelsize=23)
|
111 |
+
ax.tick_params(axis='y', colors='grey', labelsize=23)
|
112 |
+
plt.plot(array)
|
113 |
+
|
114 |
+
|
115 |
+
def plot_spec(M) :
|
116 |
+
M = np.flip(M, axis=0)
|
117 |
+
plt.figure(figsize=(18,4))
|
118 |
+
plt.imshow(M, interpolation='nearest', aspect='auto')
|
119 |
+
plt.show()
|
120 |
+
|
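A minimal usage sketch of the console helpers above; the `vocoder.display` import path follows the `from vocoder.display import *` used later in fatchord_version.py, and the table values are arbitrary.

import time
from vocoder.display import progbar, stream, simple_table, time_since

simple_table([("Batch Size", 32), ("LR", "1e-4")])
start = time.time()
for i in range(1, 101):
    # overwrite the same console line with a progress bar and elapsed time
    stream("| %s | elapsed: %s " % (progbar(i, 100), time_since(start)))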
distribution.py
ADDED
@@ -0,0 +1,132 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
|
6 |
+
def log_sum_exp(x):
|
7 |
+
""" numerically stable log_sum_exp implementation that prevents overflow """
|
8 |
+
# TF ordering
|
9 |
+
axis = len(x.size()) - 1
|
10 |
+
m, _ = torch.max(x, dim=axis)
|
11 |
+
m2, _ = torch.max(x, dim=axis, keepdim=True)
|
12 |
+
return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
|
13 |
+
|
14 |
+
|
15 |
+
# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
|
16 |
+
def discretized_mix_logistic_loss(y_hat, y, num_classes=65536,
|
17 |
+
log_scale_min=None, reduce=True):
|
18 |
+
if log_scale_min is None:
|
19 |
+
log_scale_min = float(np.log(1e-14))
|
20 |
+
y_hat = y_hat.permute(0,2,1)
|
21 |
+
assert y_hat.dim() == 3
|
22 |
+
assert y_hat.size(1) % 3 == 0
|
23 |
+
nr_mix = y_hat.size(1) // 3
|
24 |
+
|
25 |
+
# (B x T x C)
|
26 |
+
y_hat = y_hat.transpose(1, 2)
|
27 |
+
|
28 |
+
# unpack parameters. (B, T, num_mixtures) x 3
|
29 |
+
logit_probs = y_hat[:, :, :nr_mix]
|
30 |
+
means = y_hat[:, :, nr_mix:2 * nr_mix]
|
31 |
+
log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)
|
32 |
+
|
33 |
+
# B x T x 1 -> B x T x num_mixtures
|
34 |
+
y = y.expand_as(means)
|
35 |
+
|
36 |
+
centered_y = y - means
|
37 |
+
inv_stdv = torch.exp(-log_scales)
|
38 |
+
plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
|
39 |
+
cdf_plus = torch.sigmoid(plus_in)
|
40 |
+
min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
|
41 |
+
cdf_min = torch.sigmoid(min_in)
|
42 |
+
|
43 |
+
# log probability for edge case of 0 (before scaling)
|
44 |
+
# equivalent: torch.log(F.sigmoid(plus_in))
|
45 |
+
log_cdf_plus = plus_in - F.softplus(plus_in)
|
46 |
+
|
47 |
+
# log probability for edge case of 255 (before scaling)
|
48 |
+
# equivalent: (1 - F.sigmoid(min_in)).log()
|
49 |
+
log_one_minus_cdf_min = -F.softplus(min_in)
|
50 |
+
|
51 |
+
# probability for all other cases
|
52 |
+
cdf_delta = cdf_plus - cdf_min
|
53 |
+
|
54 |
+
mid_in = inv_stdv * centered_y
|
55 |
+
# log probability in the center of the bin, to be used in extreme cases
|
56 |
+
# (not actually used in our code)
|
57 |
+
log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
|
58 |
+
|
59 |
+
# tf equivalent
|
60 |
+
"""
|
61 |
+
log_probs = tf.where(x < -0.999, log_cdf_plus,
|
62 |
+
tf.where(x > 0.999, log_one_minus_cdf_min,
|
63 |
+
tf.where(cdf_delta > 1e-5,
|
64 |
+
tf.log(tf.maximum(cdf_delta, 1e-12)),
|
65 |
+
log_pdf_mid - np.log(127.5))))
|
66 |
+
"""
|
67 |
+
# TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
|
68 |
+
# for num_classes=65536 case? 1e-7? not sure..
|
69 |
+
inner_inner_cond = (cdf_delta > 1e-5).float()
|
70 |
+
|
71 |
+
inner_inner_out = inner_inner_cond * \
|
72 |
+
torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
|
73 |
+
(1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
|
74 |
+
inner_cond = (y > 0.999).float()
|
75 |
+
inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
|
76 |
+
cond = (y < -0.999).float()
|
77 |
+
log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
|
78 |
+
|
79 |
+
log_probs = log_probs + F.log_softmax(logit_probs, -1)
|
80 |
+
|
81 |
+
if reduce:
|
82 |
+
return -torch.mean(log_sum_exp(log_probs))
|
83 |
+
else:
|
84 |
+
return -log_sum_exp(log_probs).unsqueeze(-1)
|
85 |
+
|
86 |
+
|
87 |
+
def sample_from_discretized_mix_logistic(y, log_scale_min=None):
|
88 |
+
"""
|
89 |
+
Sample from discretized mixture of logistic distributions
|
90 |
+
Args:
|
91 |
+
y (Tensor): B x C x T
|
92 |
+
log_scale_min (float): Log scale minimum value
|
93 |
+
Returns:
|
94 |
+
Tensor: sample in range of [-1, 1].
|
95 |
+
"""
|
96 |
+
if log_scale_min is None:
|
97 |
+
log_scale_min = float(np.log(1e-14))
|
98 |
+
assert y.size(1) % 3 == 0
|
99 |
+
nr_mix = y.size(1) // 3
|
100 |
+
|
101 |
+
# B x T x C
|
102 |
+
y = y.transpose(1, 2)
|
103 |
+
logit_probs = y[:, :, :nr_mix]
|
104 |
+
|
105 |
+
# sample mixture indicator from softmax
|
106 |
+
temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
|
107 |
+
temp = logit_probs.data - torch.log(- torch.log(temp))
|
108 |
+
_, argmax = temp.max(dim=-1)
|
109 |
+
|
110 |
+
# (B, T) -> (B, T, nr_mix)
|
111 |
+
one_hot = to_one_hot(argmax, nr_mix)
|
112 |
+
# select logistic parameters
|
113 |
+
means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
|
114 |
+
log_scales = torch.clamp(torch.sum(
|
115 |
+
y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
|
116 |
+
# sample from logistic & clip to interval
|
117 |
+
# we don't actually round to the nearest 8bit value when sampling
|
118 |
+
u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
|
119 |
+
x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
|
120 |
+
|
121 |
+
x = torch.clamp(torch.clamp(x, min=-1.), max=1.)
|
122 |
+
|
123 |
+
return x
|
124 |
+
|
125 |
+
|
126 |
+
def to_one_hot(tensor, n, fill_with=1.):
|
127 |
+
# we perform one-hot encoding with respect to the last axis
|
128 |
+
one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
|
129 |
+
if tensor.is_cuda:
|
130 |
+
one_hot = one_hot.cuda()
|
131 |
+
one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
|
132 |
+
return one_hot
|
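A shape-checking sketch for the two functions above; the sizes are illustrative and the module path matches the `from vocoder.distribution import ...` used by fatchord_version.py.

import torch
from vocoder.distribution import discretized_mix_logistic_loss, sample_from_discretized_mix_logistic

B, T, nr_mix = 2, 100, 10
y_hat = torch.randn(B, T, 3 * nr_mix)     # [logit_probs | means | log_scales] along the last axis
y = torch.rand(B, T, 1) * 2 - 1           # target samples scaled to [-1, 1]
loss = discretized_mix_logistic_loss(y_hat, y)                     # scalar when reduce=True
x = sample_from_discretized_mix_logistic(y_hat.transpose(1, 2))    # expects (B, C, T), returns (B, T) in [-1, 1]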
encoder_preprocess.py
ADDED
@@ -0,0 +1,70 @@
1 |
+
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2
|
2 |
+
from utils.argutils import print_args
|
3 |
+
from pathlib import Path
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
if __name__ == "__main__":
|
7 |
+
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
|
8 |
+
pass
|
9 |
+
|
10 |
+
parser = argparse.ArgumentParser(
|
11 |
+
description="Preprocesses audio files from datasets, encodes them as mel spectrograms and "
|
12 |
+
"writes them to the disk. This will allow you to train the encoder. The "
|
13 |
+
"datasets required are at least one of VoxCeleb1, VoxCeleb2 and LibriSpeech. "
|
14 |
+
"Ideally, you should have all three. You should extract them as they are "
|
15 |
+
"after having downloaded them and put them in a same directory, e.g.:\n"
|
16 |
+
"-[datasets_root]\n"
|
17 |
+
" -LibriSpeech\n"
|
18 |
+
" -train-other-500\n"
|
19 |
+
" -VoxCeleb1\n"
|
20 |
+
" -wav\n"
|
21 |
+
" -vox1_meta.csv\n"
|
22 |
+
" -VoxCeleb2\n"
|
23 |
+
" -dev",
|
24 |
+
formatter_class=MyFormatter
|
25 |
+
)
|
26 |
+
parser.add_argument("datasets_root", type=Path, help=\
|
27 |
+
"Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.")
|
28 |
+
parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
|
29 |
+
"Path to the output directory that will contain the mel spectrograms. If left out, "
|
30 |
+
"defaults to <datasets_root>/SV2TTS/encoder/")
|
31 |
+
parser.add_argument("-d", "--datasets", type=str,
|
32 |
+
default="librispeech_other,voxceleb1,voxceleb2", help=\
|
33 |
+
"Comma-separated list of the name of the datasets you want to preprocess. Only the train "
|
34 |
+
"set of these datasets will be used. Possible names: librispeech_other, voxceleb1, "
|
35 |
+
"voxceleb2.")
|
36 |
+
parser.add_argument("-s", "--skip_existing", action="store_true", help=\
|
37 |
+
"Whether to skip existing output files with the same name. Useful if this script was "
|
38 |
+
"interrupted.")
|
39 |
+
parser.add_argument("--no_trim", action="store_true", help=\
|
40 |
+
"Preprocess audio without trimming silences (not recommended).")
|
41 |
+
args = parser.parse_args()
|
42 |
+
|
43 |
+
# Verify webrtcvad is available
|
44 |
+
if not args.no_trim:
|
45 |
+
try:
|
46 |
+
import webrtcvad
|
47 |
+
except:
|
48 |
+
raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
|
49 |
+
"noise removal and is recommended. Please install and try again. If installation fails, "
|
50 |
+
"use --no_trim to disable this error message.")
|
51 |
+
del args.no_trim
|
52 |
+
|
53 |
+
# Process the arguments
|
54 |
+
args.datasets = args.datasets.split(",")
|
55 |
+
if not hasattr(args, "out_dir"):
|
56 |
+
args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder")
|
57 |
+
assert args.datasets_root.exists()
|
58 |
+
args.out_dir.mkdir(exist_ok=True, parents=True)
|
59 |
+
|
60 |
+
# Preprocess the datasets
|
61 |
+
print_args(args, parser)
|
62 |
+
preprocess_func = {
|
63 |
+
"librispeech_other": preprocess_librispeech,
|
64 |
+
"voxceleb1": preprocess_voxceleb1,
|
65 |
+
"voxceleb2": preprocess_voxceleb2,
|
66 |
+
}
|
67 |
+
args = vars(args)
|
68 |
+
for dataset in args.pop("datasets"):
|
69 |
+
print("Preprocessing %s" % dataset)
|
70 |
+
preprocess_func[dataset](**args)
|
encoder_train.py
ADDED
@@ -0,0 +1,47 @@
1 |
+
from utils.argutils import print_args
|
2 |
+
from encoder.train import train
|
3 |
+
from pathlib import Path
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
|
7 |
+
if __name__ == "__main__":
|
8 |
+
parser = argparse.ArgumentParser(
|
9 |
+
description="Trains the speaker encoder. You must have run encoder_preprocess.py first.",
|
10 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
11 |
+
)
|
12 |
+
|
13 |
+
parser.add_argument("run_id", type=str, help= \
|
14 |
+
"Name for this model instance. If a model state from the same run ID was previously "
|
15 |
+
"saved, the training will restart from there. Pass -f to overwrite saved states and "
|
16 |
+
"restart from scratch.")
|
17 |
+
parser.add_argument("clean_data_root", type=Path, help= \
|
18 |
+
"Path to the output directory of encoder_preprocess.py. If you left the default "
|
19 |
+
"output directory when preprocessing, it should be <datasets_root>/SV2TTS/encoder/.")
|
20 |
+
parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\
|
21 |
+
"Path to the output directory that will contain the saved model weights, as well as "
|
22 |
+
"backups of those weights and plots generated during training.")
|
23 |
+
parser.add_argument("-v", "--vis_every", type=int, default=10, help= \
|
24 |
+
"Number of steps between updates of the loss and the plots.")
|
25 |
+
parser.add_argument("-u", "--umap_every", type=int, default=100, help= \
|
26 |
+
"Number of steps between updates of the umap projection. Set to 0 to never update the "
|
27 |
+
"projections.")
|
28 |
+
parser.add_argument("-s", "--save_every", type=int, default=500, help= \
|
29 |
+
"Number of steps between updates of the model on the disk. Set to 0 to never save the "
|
30 |
+
"model.")
|
31 |
+
parser.add_argument("-b", "--backup_every", type=int, default=7500, help= \
|
32 |
+
"Number of steps between backups of the model. Set to 0 to never make backups of the "
|
33 |
+
"model.")
|
34 |
+
parser.add_argument("-f", "--force_restart", action="store_true", help= \
|
35 |
+
"Do not load any saved model.")
|
36 |
+
parser.add_argument("--visdom_server", type=str, default="http://localhost")
|
37 |
+
parser.add_argument("--no_visdom", action="store_true", help= \
|
38 |
+
"Disable visdom.")
|
39 |
+
args = parser.parse_args()
|
40 |
+
|
41 |
+
# Process the arguments
|
42 |
+
args.models_dir.mkdir(exist_ok=True)
|
43 |
+
|
44 |
+
# Run the training
|
45 |
+
print_args(args, parser)
|
46 |
+
train(**vars(args))
|
47 |
+
|
fatchord_version.py
ADDED
@@ -0,0 +1,434 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from vocoder.distribution import sample_from_discretized_mix_logistic
|
5 |
+
from vocoder.display import *
|
6 |
+
from vocoder.audio import *
|
7 |
+
|
8 |
+
|
9 |
+
class ResBlock(nn.Module):
|
10 |
+
def __init__(self, dims):
|
11 |
+
super().__init__()
|
12 |
+
self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
|
13 |
+
self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
|
14 |
+
self.batch_norm1 = nn.BatchNorm1d(dims)
|
15 |
+
self.batch_norm2 = nn.BatchNorm1d(dims)
|
16 |
+
|
17 |
+
def forward(self, x):
|
18 |
+
residual = x
|
19 |
+
x = self.conv1(x)
|
20 |
+
x = self.batch_norm1(x)
|
21 |
+
x = F.relu(x)
|
22 |
+
x = self.conv2(x)
|
23 |
+
x = self.batch_norm2(x)
|
24 |
+
return x + residual
|
25 |
+
|
26 |
+
|
27 |
+
class MelResNet(nn.Module):
|
28 |
+
def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
|
29 |
+
super().__init__()
|
30 |
+
k_size = pad * 2 + 1
|
31 |
+
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
|
32 |
+
self.batch_norm = nn.BatchNorm1d(compute_dims)
|
33 |
+
self.layers = nn.ModuleList()
|
34 |
+
for i in range(res_blocks):
|
35 |
+
self.layers.append(ResBlock(compute_dims))
|
36 |
+
self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
|
37 |
+
|
38 |
+
def forward(self, x):
|
39 |
+
x = self.conv_in(x)
|
40 |
+
x = self.batch_norm(x)
|
41 |
+
x = F.relu(x)
|
42 |
+
for f in self.layers: x = f(x)
|
43 |
+
x = self.conv_out(x)
|
44 |
+
return x
|
45 |
+
|
46 |
+
|
47 |
+
class Stretch2d(nn.Module):
|
48 |
+
def __init__(self, x_scale, y_scale):
|
49 |
+
super().__init__()
|
50 |
+
self.x_scale = x_scale
|
51 |
+
self.y_scale = y_scale
|
52 |
+
|
53 |
+
def forward(self, x):
|
54 |
+
b, c, h, w = x.size()
|
55 |
+
x = x.unsqueeze(-1).unsqueeze(3)
|
56 |
+
x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
|
57 |
+
return x.view(b, c, h * self.y_scale, w * self.x_scale)
|
58 |
+
|
59 |
+
|
60 |
+
class UpsampleNetwork(nn.Module):
|
61 |
+
def __init__(self, feat_dims, upsample_scales, compute_dims,
|
62 |
+
res_blocks, res_out_dims, pad):
|
63 |
+
super().__init__()
|
64 |
+
total_scale = np.cumproduct(upsample_scales)[-1]
|
65 |
+
self.indent = pad * total_scale
|
66 |
+
self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
|
67 |
+
self.resnet_stretch = Stretch2d(total_scale, 1)
|
68 |
+
self.up_layers = nn.ModuleList()
|
69 |
+
for scale in upsample_scales:
|
70 |
+
k_size = (1, scale * 2 + 1)
|
71 |
+
padding = (0, scale)
|
72 |
+
stretch = Stretch2d(scale, 1)
|
73 |
+
conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
|
74 |
+
conv.weight.data.fill_(1. / k_size[1])
|
75 |
+
self.up_layers.append(stretch)
|
76 |
+
self.up_layers.append(conv)
|
77 |
+
|
78 |
+
def forward(self, m):
|
79 |
+
aux = self.resnet(m).unsqueeze(1)
|
80 |
+
aux = self.resnet_stretch(aux)
|
81 |
+
aux = aux.squeeze(1)
|
82 |
+
m = m.unsqueeze(1)
|
83 |
+
for f in self.up_layers: m = f(m)
|
84 |
+
m = m.squeeze(1)[:, :, self.indent:-self.indent]
|
85 |
+
return m.transpose(1, 2), aux.transpose(1, 2)
|
86 |
+
|
87 |
+
|
88 |
+
class WaveRNN(nn.Module):
|
89 |
+
def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
|
90 |
+
feat_dims, compute_dims, res_out_dims, res_blocks,
|
91 |
+
hop_length, sample_rate, mode='RAW'):
|
92 |
+
super().__init__()
|
93 |
+
self.mode = mode
|
94 |
+
self.pad = pad
|
95 |
+
if self.mode == 'RAW' :
|
96 |
+
self.n_classes = 2 ** bits
|
97 |
+
elif self.mode == 'MOL' :
|
98 |
+
self.n_classes = 30
|
99 |
+
else :
|
100 |
+
RuntimeError("Unknown model mode value - ", self.mode)
|
101 |
+
|
102 |
+
self.rnn_dims = rnn_dims
|
103 |
+
self.aux_dims = res_out_dims // 4
|
104 |
+
self.hop_length = hop_length
|
105 |
+
self.sample_rate = sample_rate
|
106 |
+
|
107 |
+
self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad)
|
108 |
+
self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
|
109 |
+
self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
|
110 |
+
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
|
111 |
+
self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
|
112 |
+
self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
|
113 |
+
self.fc3 = nn.Linear(fc_dims, self.n_classes)
|
114 |
+
|
115 |
+
self.step = nn.Parameter(torch.zeros(1).long(), requires_grad=False)
|
116 |
+
self.num_params()
|
117 |
+
|
118 |
+
def forward(self, x, mels):
|
119 |
+
self.step += 1
|
120 |
+
bsize = x.size(0)
|
121 |
+
if torch.cuda.is_available():
|
122 |
+
h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
|
123 |
+
h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
|
124 |
+
else:
|
125 |
+
h1 = torch.zeros(1, bsize, self.rnn_dims).cpu()
|
126 |
+
h2 = torch.zeros(1, bsize, self.rnn_dims).cpu()
|
127 |
+
mels, aux = self.upsample(mels)
|
128 |
+
|
129 |
+
aux_idx = [self.aux_dims * i for i in range(5)]
|
130 |
+
a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
|
131 |
+
a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
|
132 |
+
a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
|
133 |
+
a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
|
134 |
+
|
135 |
+
x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
|
136 |
+
x = self.I(x)
|
137 |
+
res = x
|
138 |
+
x, _ = self.rnn1(x, h1)
|
139 |
+
|
140 |
+
x = x + res
|
141 |
+
res = x
|
142 |
+
x = torch.cat([x, a2], dim=2)
|
143 |
+
x, _ = self.rnn2(x, h2)
|
144 |
+
|
145 |
+
x = x + res
|
146 |
+
x = torch.cat([x, a3], dim=2)
|
147 |
+
x = F.relu(self.fc1(x))
|
148 |
+
|
149 |
+
x = torch.cat([x, a4], dim=2)
|
150 |
+
x = F.relu(self.fc2(x))
|
151 |
+
return self.fc3(x)
|
152 |
+
|
153 |
+
def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
|
154 |
+
mu_law = mu_law if self.mode == 'RAW' else False
|
155 |
+
progress_callback = progress_callback or self.gen_display
|
156 |
+
|
157 |
+
self.eval()
|
158 |
+
output = []
|
159 |
+
start = time.time()
|
160 |
+
rnn1 = self.get_gru_cell(self.rnn1)
|
161 |
+
rnn2 = self.get_gru_cell(self.rnn2)
|
162 |
+
|
163 |
+
with torch.no_grad():
|
164 |
+
if torch.cuda.is_available():
|
165 |
+
mels = mels.cuda()
|
166 |
+
else:
|
167 |
+
mels = mels.cpu()
|
168 |
+
wave_len = (mels.size(-1) - 1) * self.hop_length
|
169 |
+
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
|
170 |
+
mels, aux = self.upsample(mels.transpose(1, 2))
|
171 |
+
|
172 |
+
if batched:
|
173 |
+
mels = self.fold_with_overlap(mels, target, overlap)
|
174 |
+
aux = self.fold_with_overlap(aux, target, overlap)
|
175 |
+
|
176 |
+
b_size, seq_len, _ = mels.size()
|
177 |
+
|
178 |
+
if torch.cuda.is_available():
|
179 |
+
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
|
180 |
+
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
|
181 |
+
x = torch.zeros(b_size, 1).cuda()
|
182 |
+
else:
|
183 |
+
h1 = torch.zeros(b_size, self.rnn_dims).cpu()
|
184 |
+
h2 = torch.zeros(b_size, self.rnn_dims).cpu()
|
185 |
+
x = torch.zeros(b_size, 1).cpu()
|
186 |
+
|
187 |
+
d = self.aux_dims
|
188 |
+
aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
|
189 |
+
|
190 |
+
for i in range(seq_len):
|
191 |
+
|
192 |
+
m_t = mels[:, i, :]
|
193 |
+
|
194 |
+
a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
|
195 |
+
|
196 |
+
x = torch.cat([x, m_t, a1_t], dim=1)
|
197 |
+
x = self.I(x)
|
198 |
+
h1 = rnn1(x, h1)
|
199 |
+
|
200 |
+
x = x + h1
|
201 |
+
inp = torch.cat([x, a2_t], dim=1)
|
202 |
+
h2 = rnn2(inp, h2)
|
203 |
+
|
204 |
+
x = x + h2
|
205 |
+
x = torch.cat([x, a3_t], dim=1)
|
206 |
+
x = F.relu(self.fc1(x))
|
207 |
+
|
208 |
+
x = torch.cat([x, a4_t], dim=1)
|
209 |
+
x = F.relu(self.fc2(x))
|
210 |
+
|
211 |
+
logits = self.fc3(x)
|
212 |
+
|
213 |
+
if self.mode == 'MOL':
|
214 |
+
sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
|
215 |
+
output.append(sample.view(-1))
|
216 |
+
if torch.cuda.is_available():
|
217 |
+
# x = torch.FloatTensor([[sample]]).cuda()
|
218 |
+
x = sample.transpose(0, 1).cuda()
|
219 |
+
else:
|
220 |
+
x = sample.transpose(0, 1)
|
221 |
+
|
222 |
+
elif self.mode == 'RAW' :
|
223 |
+
posterior = F.softmax(logits, dim=1)
|
224 |
+
distrib = torch.distributions.Categorical(posterior)
|
225 |
+
|
226 |
+
sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
|
227 |
+
output.append(sample)
|
228 |
+
x = sample.unsqueeze(-1)
|
229 |
+
else:
|
230 |
+
raise RuntimeError("Unknown model mode value - ", self.mode)
|
231 |
+
|
232 |
+
if i % 100 == 0:
|
233 |
+
gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
|
234 |
+
progress_callback(i, seq_len, b_size, gen_rate)
|
235 |
+
|
236 |
+
output = torch.stack(output).transpose(0, 1)
|
237 |
+
output = output.cpu().numpy()
|
238 |
+
output = output.astype(np.float64)
|
239 |
+
|
240 |
+
if batched:
|
241 |
+
output = self.xfade_and_unfold(output, target, overlap)
|
242 |
+
else:
|
243 |
+
output = output[0]
|
244 |
+
|
245 |
+
if mu_law:
|
246 |
+
output = decode_mu_law(output, self.n_classes, False)
|
247 |
+
if hp.apply_preemphasis:
|
248 |
+
output = de_emphasis(output)
|
249 |
+
|
250 |
+
# Fade-out at the end to avoid signal cutting out suddenly
|
251 |
+
fade_out = np.linspace(1, 0, 20 * self.hop_length)
|
252 |
+
output = output[:wave_len]
|
253 |
+
output[-20 * self.hop_length:] *= fade_out
|
254 |
+
|
255 |
+
self.train()
|
256 |
+
|
257 |
+
return output
|
258 |
+
|
259 |
+
|
260 |
+
def gen_display(self, i, seq_len, b_size, gen_rate):
|
261 |
+
pbar = progbar(i, seq_len)
|
262 |
+
msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
|
263 |
+
stream(msg)
|
264 |
+
|
265 |
+
def get_gru_cell(self, gru):
|
266 |
+
gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
|
267 |
+
gru_cell.weight_hh.data = gru.weight_hh_l0.data
|
268 |
+
gru_cell.weight_ih.data = gru.weight_ih_l0.data
|
269 |
+
gru_cell.bias_hh.data = gru.bias_hh_l0.data
|
270 |
+
gru_cell.bias_ih.data = gru.bias_ih_l0.data
|
271 |
+
return gru_cell
|
272 |
+
|
273 |
+
def pad_tensor(self, x, pad, side='both'):
|
274 |
+
# NB - this is just a quick method i need right now
|
275 |
+
# i.e., it won't generalise to other shapes/dims
|
276 |
+
b, t, c = x.size()
|
277 |
+
total = t + 2 * pad if side == 'both' else t + pad
|
278 |
+
if torch.cuda.is_available():
|
279 |
+
padded = torch.zeros(b, total, c).cuda()
|
280 |
+
else:
|
281 |
+
padded = torch.zeros(b, total, c).cpu()
|
282 |
+
if side == 'before' or side == 'both':
|
283 |
+
padded[:, pad:pad + t, :] = x
|
284 |
+
elif side == 'after':
|
285 |
+
padded[:, :t, :] = x
|
286 |
+
return padded
|
287 |
+
|
288 |
+
def fold_with_overlap(self, x, target, overlap):
|
289 |
+
|
290 |
+
''' Fold the tensor with overlap for quick batched inference.
|
291 |
+
Overlap will be used for crossfading in xfade_and_unfold()
|
292 |
+
|
293 |
+
Args:
|
294 |
+
x (tensor) : Upsampled conditioning features.
|
295 |
+
shape=(1, timesteps, features)
|
296 |
+
target (int) : Target timesteps for each index of batch
|
297 |
+
overlap (int) : Timesteps for both xfade and rnn warmup
|
298 |
+
|
299 |
+
Return:
|
300 |
+
(tensor) : shape=(num_folds, target + 2 * overlap, features)
|
301 |
+
|
302 |
+
Details:
|
303 |
+
x = [[h1, h2, ... hn]]
|
304 |
+
|
305 |
+
Where each h is a vector of conditioning features
|
306 |
+
|
307 |
+
Eg: target=2, overlap=1 with x.size(1)=10
|
308 |
+
|
309 |
+
folded = [[h1, h2, h3, h4],
|
310 |
+
[h4, h5, h6, h7],
|
311 |
+
[h7, h8, h9, h10]]
|
312 |
+
'''
|
313 |
+
|
314 |
+
_, total_len, features = x.size()
|
315 |
+
|
316 |
+
# Calculate variables needed
|
317 |
+
num_folds = (total_len - overlap) // (target + overlap)
|
318 |
+
extended_len = num_folds * (overlap + target) + overlap
|
319 |
+
remaining = total_len - extended_len
|
320 |
+
|
321 |
+
# Pad if some time steps poking out
|
322 |
+
if remaining != 0:
|
323 |
+
num_folds += 1
|
324 |
+
padding = target + 2 * overlap - remaining
|
325 |
+
x = self.pad_tensor(x, padding, side='after')
|
326 |
+
|
327 |
+
if torch.cuda.is_available():
|
328 |
+
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
|
329 |
+
else:
|
330 |
+
folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
|
331 |
+
|
332 |
+
# Get the values for the folded tensor
|
333 |
+
for i in range(num_folds):
|
334 |
+
start = i * (target + overlap)
|
335 |
+
end = start + target + 2 * overlap
|
336 |
+
folded[i] = x[:, start:end, :]
|
337 |
+
|
338 |
+
return folded
|
339 |
+
|
340 |
+
def xfade_and_unfold(self, y, target, overlap):
|
341 |
+
|
342 |
+
''' Applies a crossfade and unfolds into a 1d array.
|
343 |
+
|
344 |
+
Args:
|
345 |
+
y (ndarry) : Batched sequences of audio samples
|
346 |
+
shape=(num_folds, target + 2 * overlap)
|
347 |
+
dtype=np.float64
|
348 |
+
overlap (int) : Timesteps for both xfade and rnn warmup
|
349 |
+
|
350 |
+
Return:
|
351 |
+
(ndarry) : audio samples in a 1d array
|
352 |
+
shape=(total_len)
|
353 |
+
dtype=np.float64
|
354 |
+
|
355 |
+
Details:
|
356 |
+
y = [[seq1],
|
357 |
+
[seq2],
|
358 |
+
[seq3]]
|
359 |
+
|
360 |
+
Apply a gain envelope at both ends of the sequences
|
361 |
+
|
362 |
+
y = [[seq1_in, seq1_target, seq1_out],
|
363 |
+
[seq2_in, seq2_target, seq2_out],
|
364 |
+
[seq3_in, seq3_target, seq3_out]]
|
365 |
+
|
366 |
+
Stagger and add up the groups of samples:
|
367 |
+
|
368 |
+
[seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
|
369 |
+
|
370 |
+
'''
|
371 |
+
|
372 |
+
num_folds, length = y.shape
|
373 |
+
target = length - 2 * overlap
|
374 |
+
total_len = num_folds * (target + overlap) + overlap
|
375 |
+
|
376 |
+
# Need some silence for the rnn warmup
|
377 |
+
silence_len = overlap // 2
|
378 |
+
fade_len = overlap - silence_len
|
379 |
+
silence = np.zeros((silence_len), dtype=np.float64)
|
380 |
+
|
381 |
+
# Equal power crossfade
|
382 |
+
t = np.linspace(-1, 1, fade_len, dtype=np.float64)
|
383 |
+
fade_in = np.sqrt(0.5 * (1 + t))
|
384 |
+
fade_out = np.sqrt(0.5 * (1 - t))
|
385 |
+
|
386 |
+
# Concat the silence to the fades
|
387 |
+
fade_in = np.concatenate([silence, fade_in])
|
388 |
+
fade_out = np.concatenate([fade_out, silence])
|
389 |
+
|
390 |
+
# Apply the gain to the overlap samples
|
391 |
+
y[:, :overlap] *= fade_in
|
392 |
+
y[:, -overlap:] *= fade_out
|
393 |
+
|
394 |
+
unfolded = np.zeros((total_len), dtype=np.float64)
|
395 |
+
|
396 |
+
# Loop to add up all the samples
|
397 |
+
for i in range(num_folds):
|
398 |
+
start = i * (target + overlap)
|
399 |
+
end = start + target + 2 * overlap
|
400 |
+
unfolded[start:end] += y[i]
|
401 |
+
|
402 |
+
return unfolded
|
403 |
+
|
404 |
+
def get_step(self) :
|
405 |
+
return self.step.data.item()
|
406 |
+
|
407 |
+
def checkpoint(self, model_dir, optimizer) :
|
408 |
+
k_steps = self.get_step() // 1000
|
409 |
+
self.save(model_dir.joinpath("checkpoint_%dk_steps.pt" % k_steps), optimizer)
|
410 |
+
|
411 |
+
def log(self, path, msg) :
|
412 |
+
with open(path, 'a') as f:
|
413 |
+
print(msg, file=f)
|
414 |
+
|
415 |
+
def load(self, path, optimizer) :
|
416 |
+
checkpoint = torch.load(path)
|
417 |
+
if "optimizer_state" in checkpoint:
|
418 |
+
self.load_state_dict(checkpoint["model_state"])
|
419 |
+
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
420 |
+
else:
|
421 |
+
# Backwards compatibility
|
422 |
+
self.load_state_dict(checkpoint)
|
423 |
+
|
424 |
+
def save(self, path, optimizer) :
|
425 |
+
torch.save({
|
426 |
+
"model_state": self.state_dict(),
|
427 |
+
"optimizer_state": optimizer.state_dict(),
|
428 |
+
}, path)
|
429 |
+
|
430 |
+
def num_params(self, print_out=True):
|
431 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
432 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
433 |
+
if print_out :
|
434 |
+
print('Trainable Parameters: %.3fM' % parameters)
|
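The equal-power crossfade that xfade_and_unfold() applies to the overlapping segments, shown in isolation (illustrative sketch; inside the class the fade length is overlap minus the silence warmup).

import numpy as np

overlap = 400
t = np.linspace(-1, 1, overlap, dtype=np.float64)
fade_in = np.sqrt(0.5 * (1 + t))
fade_out = np.sqrt(0.5 * (1 - t))
# the two gains always sum to unit power, so the crossfaded region keeps constant energy
assert np.allclose(fade_in ** 2 + fade_out ** 2, 1.0)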
gen_wavernn.py
ADDED
@@ -0,0 +1,31 @@
1 |
+
from vocoder.models.fatchord_version import WaveRNN
|
2 |
+
from vocoder.audio import *
|
3 |
+
|
4 |
+
|
5 |
+
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path):
|
6 |
+
k = model.get_step() // 1000
|
7 |
+
|
8 |
+
for i, (m, x) in enumerate(test_set, 1):
|
9 |
+
if i > samples:
|
10 |
+
break
|
11 |
+
|
12 |
+
print('\n| Generating: %i/%i' % (i, samples))
|
13 |
+
|
14 |
+
x = x[0].numpy()
|
15 |
+
|
16 |
+
bits = 16 if hp.voc_mode == 'MOL' else hp.bits
|
17 |
+
|
18 |
+
if hp.mu_law and hp.voc_mode != 'MOL' :
|
19 |
+
x = decode_mu_law(x, 2**bits, from_labels=True)
|
20 |
+
else :
|
21 |
+
x = label_2_float(x, bits)
|
22 |
+
|
23 |
+
save_wav(x, save_path.joinpath("%dk_steps_%d_target.wav" % (k, i)))
|
24 |
+
|
25 |
+
batch_str = "gen_batched_target%d_overlap%d" % (target, overlap) if batched else \
|
26 |
+
"gen_not_batched"
|
27 |
+
save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (k, i, batch_str))
|
28 |
+
|
29 |
+
wav = model.generate(m, batched, target, overlap, hp.mu_law)
|
30 |
+
save_wav(wav, save_str)
|
31 |
+
|
hparams.py
ADDED
@@ -0,0 +1,44 @@
1 |
+
from synthesizer.hparams import hparams as _syn_hp
|
2 |
+
|
3 |
+
|
4 |
+
# Audio settings------------------------------------------------------------------------
|
5 |
+
# Match the values of the synthesizer
|
6 |
+
sample_rate = _syn_hp.sample_rate
|
7 |
+
n_fft = _syn_hp.n_fft
|
8 |
+
num_mels = _syn_hp.num_mels
|
9 |
+
hop_length = _syn_hp.hop_size
|
10 |
+
win_length = _syn_hp.win_size
|
11 |
+
fmin = _syn_hp.fmin
|
12 |
+
min_level_db = _syn_hp.min_level_db
|
13 |
+
ref_level_db = _syn_hp.ref_level_db
|
14 |
+
mel_max_abs_value = _syn_hp.max_abs_value
|
15 |
+
preemphasis = _syn_hp.preemphasis
|
16 |
+
apply_preemphasis = _syn_hp.preemphasize
|
17 |
+
|
18 |
+
bits = 9 # bit depth of signal
|
19 |
+
mu_law = True # Recommended to suppress noise if using raw bits in hp.voc_mode
|
20 |
+
# below
|
21 |
+
|
22 |
+
|
23 |
+
# WAVERNN / VOCODER --------------------------------------------------------------------------------
|
24 |
+
voc_mode = 'RAW' # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
|
25 |
+
# mixture of logistics)
|
26 |
+
voc_upsample_factors = (5, 5, 8) # NB - this needs to correctly factorise hop_length
|
27 |
+
voc_rnn_dims = 512
|
28 |
+
voc_fc_dims = 512
|
29 |
+
voc_compute_dims = 128
|
30 |
+
voc_res_out_dims = 128
|
31 |
+
voc_res_blocks = 10
|
32 |
+
|
33 |
+
# Training
|
34 |
+
voc_batch_size = 100
|
35 |
+
voc_lr = 1e-4
|
36 |
+
voc_gen_at_checkpoint = 5 # number of samples to generate at each checkpoint
|
37 |
+
voc_pad = 2 # this will pad the input so that the resnet can 'see' wider
|
38 |
+
# than input length
|
39 |
+
voc_seq_len = hop_length * 5 # must be a multiple of hop_length
|
40 |
+
|
41 |
+
# Generating / Synthesizing
|
42 |
+
voc_gen_batched = True # very fast (realtime+) single utterance batched generation
|
43 |
+
voc_target = 8000 # target number of samples to be generated in each batch entry
|
44 |
+
voc_overlap = 400 # number of samples for crossfading between batches
|
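As the comment on voc_upsample_factors notes, the factors must multiply out to hop_length; a quick sanity-check sketch (this assumes the synthesizer's hop size resolves to 5 * 5 * 8 = 200).

import numpy as np
from vocoder import hparams as hp

assert np.prod(hp.voc_upsample_factors) == hp.hop_length, \
    "voc_upsample_factors must factorise hop_length"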
inference.py
ADDED
@@ -0,0 +1,64 @@
1 |
+
from vocoder.models.fatchord_version import WaveRNN
|
2 |
+
from vocoder import hparams as hp
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
_model = None # type: WaveRNN
|
7 |
+
|
8 |
+
def load_model(weights_fpath, verbose=True):
|
9 |
+
global _model, _device
|
10 |
+
|
11 |
+
if verbose:
|
12 |
+
print("Building Wave-RNN")
|
13 |
+
_model = WaveRNN(
|
14 |
+
rnn_dims=hp.voc_rnn_dims,
|
15 |
+
fc_dims=hp.voc_fc_dims,
|
16 |
+
bits=hp.bits,
|
17 |
+
pad=hp.voc_pad,
|
18 |
+
upsample_factors=hp.voc_upsample_factors,
|
19 |
+
feat_dims=hp.num_mels,
|
20 |
+
compute_dims=hp.voc_compute_dims,
|
21 |
+
res_out_dims=hp.voc_res_out_dims,
|
22 |
+
res_blocks=hp.voc_res_blocks,
|
23 |
+
hop_length=hp.hop_length,
|
24 |
+
sample_rate=hp.sample_rate,
|
25 |
+
mode=hp.voc_mode
|
26 |
+
)
|
27 |
+
|
28 |
+
if torch.cuda.is_available():
|
29 |
+
_model = _model.cuda()
|
30 |
+
_device = torch.device('cuda')
|
31 |
+
else:
|
32 |
+
_device = torch.device('cpu')
|
33 |
+
|
34 |
+
if verbose:
|
35 |
+
print("Loading model weights at %s" % weights_fpath)
|
36 |
+
checkpoint = torch.load(weights_fpath, _device)
|
37 |
+
_model.load_state_dict(checkpoint['model_state'])
|
38 |
+
_model.eval()
|
39 |
+
|
40 |
+
|
41 |
+
def is_loaded():
|
42 |
+
return _model is not None
|
43 |
+
|
44 |
+
|
45 |
+
def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800,
|
46 |
+
progress_callback=None):
|
47 |
+
"""
|
48 |
+
Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
|
49 |
+
that of the synthesizer!)
|
50 |
+
|
51 |
+
:param normalize:
|
52 |
+
:param batched:
|
53 |
+
:param target:
|
54 |
+
:param overlap:
|
55 |
+
:return:
|
56 |
+
"""
|
57 |
+
if _model is None:
|
58 |
+
raise Exception("Please load Wave-RNN in memory before using it")
|
59 |
+
|
60 |
+
if normalize:
|
61 |
+
mel = mel / hp.mel_max_abs_value
|
62 |
+
mel = torch.from_numpy(mel[None, ...])
|
63 |
+
wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
|
64 |
+
return wav
|
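A minimal end-to-end sketch of this module, mirroring how demo_cli.py drives it; the checkpoint path is a placeholder and the random mel is only a stand-in for a synthesizer spectrogram (80 assumes the default num_mels).

import numpy as np
from vocoder import inference as vocoder

vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")   # placeholder checkpoint path
assert vocoder.is_loaded()
mel = np.random.rand(80, 200).astype(np.float32)                      # (num_mels, frames), synthesizer format
wav = vocoder.infer_waveform(mel, target=8000, overlap=800)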
ko_dictionary.py
ADDED
@@ -0,0 +1,174 @@
1 |
+
# coding: utf-8
|
2 |
+
|
3 |
+
etc_dictionary = {
|
4 |
+
'2 30대': '이삼십대',
|
5 |
+
'20~30대': '이삼십대',
|
6 |
+
'20, 30대': '이십대 삼십대',
|
7 |
+
'1+1': '원플러스원',
|
8 |
+
'3에서 6개월인': '3개월에서 육개월인',
|
9 |
+
}
|
10 |
+
|
11 |
+
english_dictionary = {
|
12 |
+
'Devsisters': '데브시스터즈',
|
13 |
+
'track': '트랙',
|
14 |
+
|
15 |
+
# krbook
|
16 |
+
'LA': '엘에이',
|
17 |
+
'LG': '엘지',
|
18 |
+
'KOREA': '코리아',
|
19 |
+
'JSA': '제이에스에이',
|
20 |
+
'PGA': '피지에이',
|
21 |
+
'GA': '지에이',
|
22 |
+
'idol': '아이돌',
|
23 |
+
'KTX': '케이티엑스',
|
24 |
+
'AC': '에이씨',
|
25 |
+
'DVD': '디비디',
|
26 |
+
'US': '유에스',
|
27 |
+
'CNN': '씨엔엔',
|
28 |
+
'LPGA': '엘피지에이',
|
29 |
+
'P': '피',
|
30 |
+
'L': '엘',
|
31 |
+
'T': '티',
|
32 |
+
'B': '비',
|
33 |
+
'C': '씨',
|
34 |
+
'BIFF': '비아이에프에프',
|
35 |
+
'GV': '지비',
|
36 |
+
|
37 |
+
# JTBC
|
38 |
+
'IT': '아이티',
|
39 |
+
'IQ': '아이큐',
|
40 |
+
'JTBC': '제이티비씨',
|
41 |
+
'trickle down effect': '트리클 다운 이펙트',
|
42 |
+
'trickle up effect': '트리클 업 이펙트',
|
43 |
+
'down': '다운',
|
44 |
+
'up': '업',
|
45 |
+
'FCK': '에프씨케이',
|
46 |
+
'AP': '에이피',
|
47 |
+
'WHERETHEWILDTHINGSARE': '',
|
48 |
+
'Rashomon Effect': '',
|
49 |
+
'O': '오',
|
50 |
+
'OO': '오오',
|
51 |
+
'B': '비',
|
52 |
+
'GDP': '지디피',
|
53 |
+
'CIPA': '씨아이피에이',
|
54 |
+
'YS': '와이에스',
|
55 |
+
'Y': '와이',
|
56 |
+
'S': '에스',
|
57 |
+
'JTBC': '제이티비씨',
|
58 |
+
'PC': '피씨',
|
59 |
+
'bill': '빌',
|
60 |
+
'Halmuny': '하모니', #####
|
61 |
+
'X': '엑스',
|
62 |
+
'SNS': '에스엔에스',
|
63 |
+
'ability': '어빌리티',
|
64 |
+
'shy': '',
|
65 |
+
'CCTV': '씨씨티비',
|
66 |
+
'IT': '아이티',
|
67 |
+
'the tenth man': '더 텐쓰 맨', ####
|
68 |
+
'L': '엘',
|
69 |
+
'PC': '피씨',
|
70 |
+
'YSDJJPMB': '', ########
|
71 |
+
'Content Attitude Timing': '컨텐트 애티튜드 타이밍',
|
72 |
+
'CAT': '캣',
|
73 |
+
'IS': '아이에스',
|
74 |
+
'SNS': '에스엔에스',
|
75 |
+
'K': '케이',
|
76 |
+
'Y': '와이',
|
77 |
+
'KDI': '케이디아이',
|
78 |
+
'DOC': '디오씨',
|
79 |
+
'CIA': '씨아이에이',
|
80 |
+
'PBS': '피비에스',
|
81 |
+
'D': '디',
|
82 |
+
'PPropertyPositionPowerPrisonP'
|
83 |
+
'S': '에스',
|
84 |
+
'francisco': '프란시스코',
|
85 |
+
'I': '아이',
|
86 |
+
'III': '아이아이', ######
|
87 |
+
'No joke': '노 조크',
|
88 |
+
'BBK': '비비케이',
|
89 |
+
'LA': '엘에이',
|
90 |
+
'Don': '',
|
91 |
+
't worry be happy': ' 워리 비 해피',
|
92 |
+
'NO': '엔오', #####
|
93 |
+
'it was our sky': '잇 워즈 아워 스카이',
|
94 |
+
'it is our sky': '잇 이즈 아워 스카이', ####
|
95 |
+
'NEIS': '엔이아이에스', #####
|
96 |
+
'IMF': '아이엠에프',
|
97 |
+
'apology': '어폴로지',
|
98 |
+
'humble': '험블',
|
99 |
+
'M': '엠',
|
100 |
+
'Nowhere Man': '노웨어 맨',
|
101 |
+
'The Tenth Man': '더 텐쓰 맨',
|
102 |
+
'PBS': '피비에스',
|
103 |
+
'BBC': '비비씨',
|
104 |
+
'MRJ': '엠알제이',
|
105 |
+
'CCTV': '씨씨티비',
|
106 |
+
'Pick me up': '픽 미 업',
|
107 |
+
'DNA': '디엔에이',
|
108 |
+
'UN': '유엔',
|
109 |
+
'STOP': '스탑', #####
|
110 |
+
'PRESS': '프레스', #####
|
111 |
+
'not to be': '낫 투비',
|
112 |
+
'Denial': '디나이얼',
|
113 |
+
'G': '지',
|
114 |
+
'IMF': '아이엠에프',
|
115 |
+
'GDP': '지디피',
|
116 |
+
'JTBC': '제이티비씨',
|
117 |
+
'Time flies like an arrow': '타임 플라이즈 라이크 언 애로우',
|
118 |
+
'DDT': '디디티',
|
119 |
+
'AI': '에이아이',
|
120 |
+
'Z': '제트',
|
121 |
+
'OECD': '오이씨디',
|
122 |
+
'N': '앤',
|
123 |
+
'A': '에이',
|
124 |
+
'MB': '엠비',
|
125 |
+
'EH': '이에이치',
|
126 |
+
'IS': '아이에스',
|
127 |
+
'TV': '티비',
|
128 |
+
'MIT': '엠아이티',
|
129 |
+
'KBO': '케이비오',
|
130 |
+
'I love America': '아이 러브 아메리카',
|
131 |
+
'SF': '에스에프',
|
132 |
+
'Q': '큐',
|
133 |
+
'KFX': '케이에프엑스',
|
134 |
+
'PM': '피엠',
|
135 |
+
'Prime Minister': '프라임 미니스터',
|
136 |
+
'Swordline': '스워드라인',
|
137 |
+
'TBS': '티비에스',
|
138 |
+
'DDT': '디디티',
|
139 |
+
'CS': '씨에스',
|
140 |
+
'Reflecting Absence': '리플렉팅 앱센스',
|
141 |
+
'PBS': '피비에스',
|
142 |
+
'Drum being beaten by everyone': '드럼 빙 비튼 바이 에브리원',
|
143 |
+
'negative pressure': '네거티브 프레셔',
|
144 |
+
'F': '에프',
|
145 |
+
'KIA': '기아',
|
146 |
+
'FTA': '에프티에이',
|
147 |
+
'Que sais-je': '',
|
148 |
+
'UFC': '유에프씨',
|
149 |
+
'P': '피',
|
150 |
+
'DJ': '디제이',
|
151 |
+
'Chaebol': '채벌',
|
152 |
+
'BBC': '비비씨',
|
153 |
+
'OECD': '오이씨디',
|
154 |
+
'BC': '삐씨',
|
155 |
+
'C': '씨',
|
156 |
+
'B': '씨',
|
157 |
+
'KY': '케이와이',
|
158 |
+
'K': '케이',
|
159 |
+
'CEO': '씨이오',
|
160 |
+
'YH': '와이에치',
|
161 |
+
'IS': '아이에스',
|
162 |
+
'who are you': '후 얼 유',
|
163 |
+
'Y': '와이',
|
164 |
+
'The Devils Advocate': '더 데빌즈 어드보카트',
|
165 |
+
'YS': '와이에스',
|
166 |
+
'so sorry': '쏘 쏘리',
|
167 |
+
'Santa': '산타',
|
168 |
+
'Big Endian': '빅 엔디안',
|
169 |
+
'Small Endian': '스몰 엔디안',
|
170 |
+
'Oh Captain My Captain': '오 캡틴 마이 캡틴',
|
171 |
+
'AIB': '에이아이비',
|
172 |
+
'K': '케이',
|
173 |
+
'PBS': '피비에스',
|
174 |
+
}
|
korean.py
ADDED
@@ -0,0 +1,349 @@
1 |
+
# coding: utf-8
|
2 |
+
# Code based on
|
3 |
+
|
4 |
+
import re
|
5 |
+
import os
|
6 |
+
import ast
|
7 |
+
import json
|
8 |
+
from jamo import hangul_to_jamo, h2j, j2h
|
9 |
+
|
10 |
+
from .ko_dictionary import english_dictionary, etc_dictionary
|
11 |
+
|
12 |
+
PAD = '_'
|
13 |
+
EOS = '~'
|
14 |
+
PUNC = '!\'(),-.:;?'
|
15 |
+
SPACE = ' '
|
16 |
+
|
17 |
+
JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
|
18 |
+
JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
|
19 |
+
JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])
|
20 |
+
|
21 |
+
VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
|
22 |
+
ALL_SYMBOLS = PAD + EOS + VALID_CHARS
|
23 |
+
|
24 |
+
char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)}
|
25 |
+
id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)}
|
26 |
+
|
27 |
+
quote_checker = """([`"'"“‘])(.+?)([`"'"”’])"""
|
28 |
+
|
29 |
+
|
30 |
+
def is_lead(char):
|
31 |
+
return char in JAMO_LEADS
|
32 |
+
|
33 |
+
|
34 |
+
def is_vowel(char):
|
35 |
+
return char in JAMO_VOWELS
|
36 |
+
|
37 |
+
|
38 |
+
def is_tail(char):
|
39 |
+
return char in JAMO_TAILS
|
40 |
+
|
41 |
+
|
42 |
+
def get_mode(char):
|
43 |
+
if is_lead(char):
|
44 |
+
return 0
|
45 |
+
elif is_vowel(char):
|
46 |
+
return 1
|
47 |
+
elif is_tail(char):
|
48 |
+
return 2
|
49 |
+
else:
|
50 |
+
return -1
|
51 |
+
|
52 |
+
|
53 |
+
def _get_text_from_candidates(candidates):
|
54 |
+
if len(candidates) == 0:
|
55 |
+
return ""
|
56 |
+
elif len(candidates) == 1:
|
57 |
+
return _jamo_char_to_hcj(candidates[0])
|
58 |
+
else:
|
59 |
+
return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))
|
60 |
+
|
61 |
+
|
62 |
+
def jamo_to_korean(text):
|
63 |
+
text = h2j(text)
|
64 |
+
|
65 |
+
idx = 0
|
66 |
+
new_text = ""
|
67 |
+
candidates = []
|
68 |
+
|
69 |
+
while True:
|
70 |
+
if idx >= len(text):
|
71 |
+
new_text += _get_text_from_candidates(candidates)
|
72 |
+
break
|
73 |
+
|
74 |
+
char = text[idx]
|
75 |
+
mode = get_mode(char)
|
76 |
+
|
77 |
+
if mode == 0:
|
78 |
+
new_text += _get_text_from_candidates(candidates)
|
79 |
+
candidates = [char]
|
80 |
+
elif mode == -1:
|
81 |
+
new_text += _get_text_from_candidates(candidates)
|
82 |
+
new_text += char
|
83 |
+
candidates = []
|
84 |
+
else:
|
85 |
+
candidates.append(char)
|
86 |
+
|
87 |
+
idx += 1
|
88 |
+
return new_text
|
89 |
+
|
90 |
+
|
91 |
+
num_to_kor = {
|
92 |
+
'0': '영',
|
93 |
+
'1': '일',
|
94 |
+
'2': '이',
|
95 |
+
'3': '삼',
|
96 |
+
'4': '사',
|
97 |
+
'5': '오',
|
98 |
+
'6': '육',
|
99 |
+
'7': '칠',
|
100 |
+
'8': '팔',
|
101 |
+
'9': '구',
|
102 |
+
}
|
103 |
+
|
104 |
+
unit_to_kor1 = {
|
105 |
+
'%': '퍼센트',
|
106 |
+
'cm': '센치미터',
|
107 |
+
'mm': '밀리미터',
|
108 |
+
'km': '킬로미터',
|
109 |
+
'kg': '킬로그람',
|
110 |
+
}
|
111 |
+
unit_to_kor2 = {
|
112 |
+
'm': '미터',
|
113 |
+
}
|
114 |
+
|
115 |
+
upper_to_kor = {
|
116 |
+
'A': '에이',
|
117 |
+
'B': '비',
|
118 |
+
'C': '씨',
|
119 |
+
'D': '디',
|
120 |
+
'E': '이',
|
121 |
+
'F': '에프',
|
122 |
+
'G': '지',
|
123 |
+
'H': '에이치',
|
124 |
+
'I': '아이',
|
125 |
+
'J': '제이',
|
126 |
+
'K': '케이',
|
127 |
+
'L': '엘',
|
128 |
+
'M': '엠',
|
129 |
+
'N': '엔',
|
130 |
+
'O': '오',
|
131 |
+
'P': '피',
|
132 |
+
'Q': '큐',
|
133 |
+
'R': '알',
|
134 |
+
'S': '에스',
|
135 |
+
'T': '티',
|
136 |
+
'U': '유',
|
137 |
+
'V': '브이',
|
138 |
+
'W': '더블유',
|
139 |
+
'X': '엑스',
|
140 |
+
'Y': '와이',
|
141 |
+
'Z': '지',
|
142 |
+
}
|
143 |
+
|
144 |
+
|
145 |
+
def compare_sentence_with_jamo(text1, text2):
|
146 |
+
return h2j(text1) != h2j(text2)
|
147 |
+
|
148 |
+
|
149 |
+
def tokenize(text, as_id=False):
|
150 |
+
# jamo package에 있는 hangul_to_jamo를 이용하여 한글 string을 초성/중성/종성으로 나눈다.
|
151 |
+
text = normalize(text)
|
152 |
+
tokens = list(hangul_to_jamo(text)) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~']
|
153 |
+
|
154 |
+
if as_id:
|
155 |
+
return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
|
156 |
+
else:
|
157 |
+
return [token for token in tokens] + [EOS]
|
158 |
+
|
159 |
+
|
160 |
+
def tokenizer_fn(iterator):
|
161 |
+
return (token for x in iterator for token in tokenize(x, as_id=False))
|
162 |
+
|
163 |
+
|
164 |
+
def normalize(text):
|
165 |
+
text = text.strip()
|
166 |
+
|
167 |
+
text = re.sub('\(\d+일\)', '', text)
|
168 |
+
text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text)
|
169 |
+
|
170 |
+
text = normalize_with_dictionary(text, etc_dictionary)
|
171 |
+
text = normalize_english(text)
|
172 |
+
text = re.sub('[a-zA-Z]+', normalize_upper, text)
|
173 |
+
|
174 |
+
text = normalize_quote(text)
|
175 |
+
text = normalize_number(text)
|
176 |
+
|
177 |
+
return text
|
178 |
+
|
179 |
+
|
180 |
+
def normalize_with_dictionary(text, dic):
|
181 |
+
if any(key in text for key in dic.keys()):
|
182 |
+
pattern = re.compile('|'.join(re.escape(key) for key in dic.keys()))
|
183 |
+
return pattern.sub(lambda x: dic[x.group()], text)
|
184 |
+
else:
|
185 |
+
return text
|
186 |
+
|
187 |
+
|
188 |
+
def normalize_english(text):
|
189 |
+
def fn(m):
|
190 |
+
word = m.group()
|
191 |
+
if word in english_dictionary:
|
192 |
+
return english_dictionary.get(word)
|
193 |
+
else:
|
194 |
+
return word
|
195 |
+
|
196 |
+
text = re.sub("([A-Za-z]+)", fn, text)
|
197 |
+
return text
|
198 |
+
|
199 |
+
|
200 |
+
def normalize_upper(text):
|
201 |
+
text = text.group(0)
|
202 |
+
|
203 |
+
if all([char.isupper() for char in text]):
|
204 |
+
return "".join(upper_to_kor[char] for char in text)
|
205 |
+
else:
|
206 |
+
return text
|
207 |
+
|
208 |
+
|
209 |
+
def normalize_quote(text):
|
210 |
+
def fn(found_text):
|
211 |
+
from nltk import sent_tokenize  # NLTK doesn't play nicely with multiprocessing
|
212 |
+
|
213 |
+
found_text = found_text.group()
|
214 |
+
unquoted_text = found_text[1:-1]
|
215 |
+
|
216 |
+
sentences = sent_tokenize(unquoted_text)
|
217 |
+
return " ".join(["'{}'".format(sent) for sent in sentences])
|
218 |
+
|
219 |
+
return re.sub(quote_checker, fn, text)
|
220 |
+
|
221 |
+
|
222 |
+
number_checker = "([+-]?\d[\d,]*)[\.]?\d*"
|
223 |
+
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"
|
224 |
+
|
225 |
+
|
226 |
+
def normalize_number(text):
|
227 |
+
text = normalize_with_dictionary(text, unit_to_kor1)
|
228 |
+
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(number_checker + count_checker,
                  lambda x: number_to_korean(x, True), text)
    text = re.sub(number_checker,
                  lambda x: number_to_korean(x, False), text)
    return text


num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"]
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}


def number_to_korean(num_str, is_count=False):
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(',', '')
    # print("before ast : ", num_str, "dtype : ", type(num_str))

    try:
        num = ast.literal_eval(num_str)
        # print("After ast :", num, "dtype : ", type(num))
    except Exception:
        num_str = re.sub('^0+', '', num_str)
        num = ast.literal_eval(num_str)

    if num == 0:
        return "영"

    check_float = num_str.split('.')
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    digit = int(digit_str)

    if digit_str.startswith("-"):
        digit, digit_str = abs(digit), str(abs(digit))

    kor = ""
    size = len(str(digit))
    tmp = []

    for i, v in enumerate(digit_str, start=1):
        v = int(v)

        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]

            tmp += num_to_kor3[(size - i) % 4]

        if (size - i) % 4 == 0 and len(tmp) != 0:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                '|'.join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()], kor)

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str


if __name__ == "__main__":
    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("=" * 30)


    test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute")
    test_normalize("오늘(13일) 3,600마리 강아지가")
    test_normalize("60.3%")
    test_normalize('"저돌"(猪突) 입니다.')
    test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')
    test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")
    test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")
    print(list(hangul_to_jamo(list(hangul_to_jamo('비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”')))))
logmmse.py
ADDED
@@ -0,0 +1,247 @@
1 |
+
# The MIT License (MIT)
|
2 |
+
#
|
3 |
+
# Copyright (c) 2015 braindead
|
4 |
+
#
|
5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
7 |
+
# in the Software without restriction, including without limitation the rights
|
8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
10 |
+
# furnished to do so, subject to the following conditions:
|
11 |
+
#
|
12 |
+
# The above copyright notice and this permission notice shall be included in all
|
13 |
+
# copies or substantial portions of the Software.
|
14 |
+
#
|
15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
# SOFTWARE.
|
22 |
+
#
|
23 |
+
#
|
24 |
+
# This code was extracted from the logmmse package (https://pypi.org/project/logmmse/) and I
|
25 |
+
# simply modified the interface to meet my needs.
|
26 |
+
|
27 |
+
|
28 |
+
import numpy as np
|
29 |
+
import math
|
30 |
+
from scipy.special import expn
|
31 |
+
from collections import namedtuple
|
32 |
+
|
33 |
+
NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2")
|
34 |
+
|
35 |
+
|
36 |
+
def profile_noise(noise, sampling_rate, window_size=0):
|
37 |
+
"""
|
38 |
+
Creates a profile of the noise in a given waveform.
|
39 |
+
|
40 |
+
:param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
41 |
+
:param sampling_rate: the sampling rate of the audio
|
42 |
+
:param window_size: the size of the window the logmmse algorithm operates on. A default value
|
43 |
+
will be picked if left as 0.
|
44 |
+
:return: a NoiseProfile object
|
45 |
+
"""
|
46 |
+
noise, dtype = to_float(noise)
|
47 |
+
noise += np.finfo(np.float64).eps
|
48 |
+
|
49 |
+
if window_size == 0:
|
50 |
+
window_size = int(math.floor(0.02 * sampling_rate))
|
51 |
+
|
52 |
+
if window_size % 2 == 1:
|
53 |
+
window_size = window_size + 1
|
54 |
+
|
55 |
+
perc = 50
|
56 |
+
len1 = int(math.floor(window_size * perc / 100))
|
57 |
+
len2 = int(window_size - len1)
|
58 |
+
|
59 |
+
win = np.hanning(window_size)
|
60 |
+
win = win * len2 / np.sum(win)
|
61 |
+
n_fft = 2 * window_size
|
62 |
+
|
63 |
+
noise_mean = np.zeros(n_fft)
|
64 |
+
n_frames = len(noise) // window_size
|
65 |
+
for j in range(0, window_size * n_frames, window_size):
|
66 |
+
noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
|
67 |
+
noise_mu2 = (noise_mean / n_frames) ** 2
|
68 |
+
|
69 |
+
return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
|
70 |
+
|
71 |
+
|
72 |
+
def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
|
73 |
+
"""
|
74 |
+
Cleans the noise from a speech waveform given a noise profile. The waveform must have the
|
75 |
+
same sampling rate as the one used to create the noise profile.
|
76 |
+
|
77 |
+
:param wav: a speech waveform as a numpy array of floats or ints.
|
78 |
+
:param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
|
79 |
+
the same) waveform.
|
80 |
+
:param eta: voice threshold for noise update. While the voice activation detection value is
|
81 |
+
below this threshold, the noise profile will be continuously updated throughout the audio.
|
82 |
+
Set to 0 to disable updating the noise profile.
|
83 |
+
:return: the clean wav as a numpy array of floats or ints of the same length.
|
84 |
+
"""
|
85 |
+
wav, dtype = to_float(wav)
|
86 |
+
wav += np.finfo(np.float64).eps
|
87 |
+
p = noise_profile
|
88 |
+
|
89 |
+
nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
|
90 |
+
x_final = np.zeros(nframes * p.len2)
|
91 |
+
|
92 |
+
aa = 0.98
|
93 |
+
mu = 0.98
|
94 |
+
ksi_min = 10 ** (-25 / 10)
|
95 |
+
|
96 |
+
x_old = np.zeros(p.len1)
|
97 |
+
xk_prev = np.zeros(p.len1)
|
98 |
+
noise_mu2 = p.noise_mu2
|
99 |
+
for k in range(0, nframes * p.len2, p.len2):
|
100 |
+
insign = p.win * wav[k:k + p.window_size]
|
101 |
+
|
102 |
+
spec = np.fft.fft(insign, p.n_fft, axis=0)
|
103 |
+
sig = np.absolute(spec)
|
104 |
+
sig2 = sig ** 2
|
105 |
+
|
106 |
+
gammak = np.minimum(sig2 / noise_mu2, 40)
|
107 |
+
|
108 |
+
if xk_prev.all() == 0:
|
109 |
+
ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
|
110 |
+
else:
|
111 |
+
ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
|
112 |
+
ksi = np.maximum(ksi_min, ksi)
|
113 |
+
|
114 |
+
log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
|
115 |
+
vad_decision = np.sum(log_sigma_k) / p.window_size
|
116 |
+
if vad_decision < eta:
|
117 |
+
noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
|
118 |
+
|
119 |
+
a = ksi / (1 + ksi)
|
120 |
+
vk = a * gammak
|
121 |
+
ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
|
122 |
+
hw = a * np.exp(ei_vk)
|
123 |
+
sig = sig * hw
|
124 |
+
xk_prev = sig ** 2
|
125 |
+
xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
|
126 |
+
xi_w = np.real(xi_w)
|
127 |
+
|
128 |
+
x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
|
129 |
+
x_old = xi_w[p.len1:p.window_size]
|
130 |
+
|
131 |
+
output = from_float(x_final, dtype)
|
132 |
+
output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
|
133 |
+
return output
|
134 |
+
|
135 |
+
|
136 |
+
## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that
|
137 |
+
## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of
|
138 |
+
## webrctvad
|
139 |
+
# def vad(wav, sampling_rate, eta=0.15, window_size=0):
|
140 |
+
# """
|
141 |
+
# TODO: fix doc
|
142 |
+
# Creates a profile of the noise in a given waveform.
|
143 |
+
#
|
144 |
+
# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
145 |
+
# :param sampling_rate: the sampling rate of the audio
|
146 |
+
# :param window_size: the size of the window the logmmse algorithm operates on. A default value
|
147 |
+
# will be picked if left as 0.
|
148 |
+
# :param eta: voice threshold for noise update. While the voice activation detection value is
|
149 |
+
# below this threshold, the noise profile will be continuously updated throughout the audio.
|
150 |
+
# Set to 0 to disable updating the noise profile.
|
151 |
+
# """
|
152 |
+
# wav, dtype = to_float(wav)
|
153 |
+
# wav += np.finfo(np.float64).eps
|
154 |
+
#
|
155 |
+
# if window_size == 0:
|
156 |
+
# window_size = int(math.floor(0.02 * sampling_rate))
|
157 |
+
#
|
158 |
+
# if window_size % 2 == 1:
|
159 |
+
# window_size = window_size + 1
|
160 |
+
#
|
161 |
+
# perc = 50
|
162 |
+
# len1 = int(math.floor(window_size * perc / 100))
|
163 |
+
# len2 = int(window_size - len1)
|
164 |
+
#
|
165 |
+
# win = np.hanning(window_size)
|
166 |
+
# win = win * len2 / np.sum(win)
|
167 |
+
# n_fft = 2 * window_size
|
168 |
+
#
|
169 |
+
# wav_mean = np.zeros(n_fft)
|
170 |
+
# n_frames = len(wav) // window_size
|
171 |
+
# for j in range(0, window_size * n_frames, window_size):
|
172 |
+
# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0))
|
173 |
+
# noise_mu2 = (wav_mean / n_frames) ** 2
|
174 |
+
#
|
175 |
+
# wav, dtype = to_float(wav)
|
176 |
+
# wav += np.finfo(np.float64).eps
|
177 |
+
#
|
178 |
+
# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2))
|
179 |
+
# vad = np.zeros(nframes * len2, dtype=np.bool)
|
180 |
+
#
|
181 |
+
# aa = 0.98
|
182 |
+
# mu = 0.98
|
183 |
+
# ksi_min = 10 ** (-25 / 10)
|
184 |
+
#
|
185 |
+
# xk_prev = np.zeros(len1)
|
186 |
+
# noise_mu2 = noise_mu2
|
187 |
+
# for k in range(0, nframes * len2, len2):
|
188 |
+
# insign = win * wav[k:k + window_size]
|
189 |
+
#
|
190 |
+
# spec = np.fft.fft(insign, n_fft, axis=0)
|
191 |
+
# sig = np.absolute(spec)
|
192 |
+
# sig2 = sig ** 2
|
193 |
+
#
|
194 |
+
# gammak = np.minimum(sig2 / noise_mu2, 40)
|
195 |
+
#
|
196 |
+
# if xk_prev.all() == 0:
|
197 |
+
# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
|
198 |
+
# else:
|
199 |
+
# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
|
200 |
+
# ksi = np.maximum(ksi_min, ksi)
|
201 |
+
#
|
202 |
+
# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
|
203 |
+
# vad_decision = np.sum(log_sigma_k) / window_size
|
204 |
+
# if vad_decision < eta:
|
205 |
+
# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
|
206 |
+
# print(vad_decision)
|
207 |
+
#
|
208 |
+
# a = ksi / (1 + ksi)
|
209 |
+
# vk = a * gammak
|
210 |
+
# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
|
211 |
+
# hw = a * np.exp(ei_vk)
|
212 |
+
# sig = sig * hw
|
213 |
+
# xk_prev = sig ** 2
|
214 |
+
#
|
215 |
+
# vad[k:k + len2] = vad_decision >= eta
|
216 |
+
#
|
217 |
+
# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant")
|
218 |
+
# return vad
|
219 |
+
|
220 |
+
|
221 |
+
def to_float(_input):
|
222 |
+
if _input.dtype == np.float64:
|
223 |
+
return _input, _input.dtype
|
224 |
+
elif _input.dtype == np.float32:
|
225 |
+
return _input.astype(np.float64), _input.dtype
|
226 |
+
elif _input.dtype == np.uint8:
|
227 |
+
return (_input - 128) / 128., _input.dtype
|
228 |
+
elif _input.dtype == np.int16:
|
229 |
+
return _input / 32768., _input.dtype
|
230 |
+
elif _input.dtype == np.int32:
|
231 |
+
return _input / 2147483648., _input.dtype
|
232 |
+
raise ValueError('Unsupported wave file format')
|
233 |
+
|
234 |
+
|
235 |
+
def from_float(_input, dtype):
|
236 |
+
if dtype == np.float64:
|
237 |
+
return _input, np.float64
|
238 |
+
elif dtype == np.float32:
|
239 |
+
return _input.astype(np.float32)
|
240 |
+
elif dtype == np.uint8:
|
241 |
+
return ((_input * 128) + 128).astype(np.uint8)
|
242 |
+
elif dtype == np.int16:
|
243 |
+
return (_input * 32768).astype(np.int16)
|
244 |
+
elif dtype == np.int32:
|
245 |
+
print(_input)
|
246 |
+
return (_input * 2147483648).astype(np.int32)
|
247 |
+
raise ValueError('Unsupported wave file format')
|
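A minimal usage sketch for the two public functions of logmmse.py above (not part of the file): the waveform is a synthetic placeholder, and the noise-only segment is assumed to be the first half second, as the profile_noise docstring suggests.

import numpy as np

sampling_rate = 16000
wav = (0.01 * np.random.randn(3 * sampling_rate)).astype(np.float32)  # placeholder waveform
noise_only = wav[:sampling_rate // 2]                                 # segment assumed to contain noise only
profile = profile_noise(noise_only, sampling_rate)                    # build the noise profile once
clean = denoise(wav, profile, eta=0.15)                               # denoised waveform, same length as the input
assert len(clean) == len(wav)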
model.py
ADDED
@@ -0,0 +1,135 @@
1 |
+
from encoder.params_model import *
|
2 |
+
from encoder.params_data import *
|
3 |
+
from scipy.interpolate import interp1d
|
4 |
+
from sklearn.metrics import roc_curve
|
5 |
+
from torch.nn.utils import clip_grad_norm_
|
6 |
+
from scipy.optimize import brentq
|
7 |
+
from torch import nn
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
|
11 |
+
|
12 |
+
class SpeakerEncoder(nn.Module):
|
13 |
+
def __init__(self, device, loss_device):
|
14 |
+
super().__init__()
|
15 |
+
self.loss_device = loss_device
|
16 |
+
|
17 |
+
# Network defition
|
18 |
+
self.lstm = nn.LSTM(input_size=mel_n_channels,
|
19 |
+
hidden_size=model_hidden_size,
|
20 |
+
num_layers=model_num_layers,
|
21 |
+
batch_first=True).to(device)
|
22 |
+
self.linear = nn.Linear(in_features=model_hidden_size,
|
23 |
+
out_features=model_embedding_size).to(device)
|
24 |
+
self.relu = torch.nn.ReLU().to(device)
|
25 |
+
|
26 |
+
# Cosine similarity scaling (with fixed initial parameter values)
|
27 |
+
self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
|
28 |
+
self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
|
29 |
+
|
30 |
+
# Loss
|
31 |
+
self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
|
32 |
+
|
33 |
+
def do_gradient_ops(self):
|
34 |
+
# Gradient scale
|
35 |
+
self.similarity_weight.grad *= 0.01
|
36 |
+
self.similarity_bias.grad *= 0.01
|
37 |
+
|
38 |
+
# Gradient clipping
|
39 |
+
clip_grad_norm_(self.parameters(), 3, norm_type=2)
|
40 |
+
|
41 |
+
def forward(self, utterances, hidden_init=None):
|
42 |
+
"""
|
43 |
+
Computes the embeddings of a batch of utterance spectrograms.
|
44 |
+
|
45 |
+
:param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
|
46 |
+
(batch_size, n_frames, n_channels)
|
47 |
+
:param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
|
48 |
+
batch_size, hidden_size). Will default to a tensor of zeros if None.
|
49 |
+
:return: the embeddings as a tensor of shape (batch_size, embedding_size)
|
50 |
+
"""
|
51 |
+
# Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
|
52 |
+
# and the final cell state.
|
53 |
+
out, (hidden, cell) = self.lstm(utterances, hidden_init)
|
54 |
+
|
55 |
+
# We take only the hidden state of the last layer
|
56 |
+
embeds_raw = self.relu(self.linear(hidden[-1]))
|
57 |
+
|
58 |
+
# L2-normalize it
|
59 |
+
embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
|
60 |
+
|
61 |
+
return embeds
|
62 |
+
|
63 |
+
def similarity_matrix(self, embeds):
|
64 |
+
"""
|
65 |
+
Computes the similarity matrix according the section 2.1 of GE2E.
|
66 |
+
|
67 |
+
:param embeds: the embeddings as a tensor of shape (speakers_per_batch,
|
68 |
+
utterances_per_speaker, embedding_size)
|
69 |
+
:return: the similarity matrix as a tensor of shape (speakers_per_batch,
|
70 |
+
utterances_per_speaker, speakers_per_batch)
|
71 |
+
"""
|
72 |
+
speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
|
73 |
+
|
74 |
+
# Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
|
75 |
+
centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
|
76 |
+
centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
|
77 |
+
|
78 |
+
# Exclusive centroids (1 per utterance)
|
79 |
+
centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
|
80 |
+
centroids_excl /= (utterances_per_speaker - 1)
|
81 |
+
centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
|
82 |
+
|
83 |
+
# Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
|
84 |
+
# product of these vectors (which is just an element-wise multiplication reduced by a sum).
|
85 |
+
# We vectorize the computation for efficiency.
|
86 |
+
sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
|
87 |
+
speakers_per_batch).to(self.loss_device)
|
88 |
+
mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
|
89 |
+
for j in range(speakers_per_batch):
|
90 |
+
mask = np.where(mask_matrix[j])[0]
|
91 |
+
sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
|
92 |
+
sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
|
93 |
+
|
94 |
+
## Even more vectorized version (slower maybe because of transpose)
|
95 |
+
# sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
|
96 |
+
# ).to(self.loss_device)
|
97 |
+
# eye = np.eye(speakers_per_batch, dtype=np.int)
|
98 |
+
# mask = np.where(1 - eye)
|
99 |
+
# sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
|
100 |
+
# mask = np.where(eye)
|
101 |
+
# sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
|
102 |
+
# sim_matrix2 = sim_matrix2.transpose(1, 2)
|
103 |
+
|
104 |
+
sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
|
105 |
+
return sim_matrix
|
106 |
+
|
107 |
+
def loss(self, embeds):
|
108 |
+
"""
|
109 |
+
Computes the softmax loss according the section 2.1 of GE2E.
|
110 |
+
|
111 |
+
:param embeds: the embeddings as a tensor of shape (speakers_per_batch,
|
112 |
+
utterances_per_speaker, embedding_size)
|
113 |
+
:return: the loss and the EER for this batch of embeddings.
|
114 |
+
"""
|
115 |
+
speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
|
116 |
+
|
117 |
+
# Loss
|
118 |
+
sim_matrix = self.similarity_matrix(embeds)
|
119 |
+
sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
|
120 |
+
speakers_per_batch))
|
121 |
+
ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
|
122 |
+
target = torch.from_numpy(ground_truth).long().to(self.loss_device)
|
123 |
+
loss = self.loss_fn(sim_matrix, target)
|
124 |
+
|
125 |
+
# EER (not backpropagated)
|
126 |
+
with torch.no_grad():
|
127 |
+
inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
|
128 |
+
labels = np.array([inv_argmax(i) for i in ground_truth])
|
129 |
+
preds = sim_matrix.detach().cpu().numpy()
|
130 |
+
|
131 |
+
# Snippet from https://yangcha.github.io/EER-ROC/
|
132 |
+
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
|
133 |
+
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
|
134 |
+
|
135 |
+
return loss, eer
|
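A shape-level sketch of how the SpeakerEncoder above is driven (illustrative only; the small batch sizes stand in for speakers_per_batch / utterances_per_speaker from params_model.py, and the random tensors stand in for real mel frames):

import torch

device = loss_device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device)
frames = torch.rand(4 * 5, 160, 40)        # (speakers * utterances, n_frames, mel_n_channels)
embeds = model(frames).view(4, 5, -1)      # (speakers_per_batch, utterances_per_speaker, embedding_size)
loss, eer = model.loss(embeds)             # GE2E softmax loss and the batch EER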
modelutils.py
ADDED
@@ -0,0 +1,17 @@
from pathlib import Path

def check_model_paths(encoder_path: Path, synthesizer_path: Path, vocoder_path: Path):
    # This function tests the model paths and makes sure at least one is valid.
    if encoder_path.is_file() or encoder_path.is_dir():
        return
    if synthesizer_path.is_file() or synthesizer_path.is_dir():
        return
    if vocoder_path.is_file() or vocoder_path.is_dir():
        return

    # If none of the paths exist, remind the user to download models if needed
    print("********************************************************************************")
    print("Error: Model files not found. Follow these instructions to get and install the models:")
    print("https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models")
    print("********************************************************************************\n")
    quit(-1)
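Illustrative call of check_model_paths (the three paths are placeholders for wherever the pretrained models were downloaded to):

from pathlib import Path

check_model_paths(Path("encoder/saved_models/pretrained.pt"),
                  Path("synthesizer/saved_models/pretrained"),
                  Path("vocoder/saved_models/pretrained.pt"))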
numbers.py
ADDED
@@ -0,0 +1,68 @@
import re
import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
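Illustrative behaviour of normalize_numbers (the exact wording of the expansions depends on the installed inflect version):

# normalize_numbers("I paid $16.42 on the 3rd of May, 1995")
#   -> roughly "I paid sixteen dollars, forty-two cents on the third of May, nineteen ninety-five"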
p240_00000.mp3
ADDED
Binary file (20.2 kB).
p260_00000.mp3
ADDED
Binary file (20.5 kB).
params_data.py
ADDED
@@ -0,0 +1,29 @@

## Mel-filterbank
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160     # 1600 ms
# Number of spectrogram frames at inference
inference_n_frames = 80     # 800 ms


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30
params_model.py
ADDED
@@ -0,0 +1,11 @@

## Model parameters
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


## Training parameters
learning_rate_init = 1e-4
speakers_per_batch = 64
utterances_per_speaker = 10
plot.py
ADDED
@@ -0,0 +1,76 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np


def split_title_line(title_text, max_words=5):
    """
    A function that splits any string based on specific character
    (returning it with the string), with maximum number of words on it
    """
    seq = title_text.split()
    return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])

def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
    if max_len is not None:
        alignment = alignment[:, :max_len]

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    im = ax.imshow(
        alignment,
        aspect="auto",
        origin="lower",
        interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"

    if split_title:
        title = split_title_line(title)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    if max_len is not None:
        target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    if split_title:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    # target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
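A small usage sketch for the two plotting helpers above (dummy arrays; the output paths are arbitrary):

import numpy as np

attention = np.random.rand(120, 400)   # (encoder timesteps, decoder timesteps)
mel = np.random.rand(500, 80)          # (frames, n_mels)
plot_alignment(attention, "alignment_step1000.png", title="step 1000", max_len=300)
plot_spectrogram(mel, "mel_step1000.png", title="step 1000",
                 target_spectrogram=mel, auto_aspect=True)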
preprocess.py
ADDED
@@ -0,0 +1,259 @@
1 |
+
from multiprocessing.pool import Pool
|
2 |
+
from synthesizer import audio
|
3 |
+
from functools import partial
|
4 |
+
from itertools import chain
|
5 |
+
from encoder import inference as encoder
|
6 |
+
from pathlib import Path
|
7 |
+
from utils import logmmse
|
8 |
+
from tqdm import tqdm
|
9 |
+
import numpy as np
|
10 |
+
import librosa
|
11 |
+
|
12 |
+
|
13 |
+
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
14 |
+
skip_existing: bool, hparams, no_alignments: bool,
|
15 |
+
datasets_name: str, subfolders: str):
|
16 |
+
# Gather the input directories
|
17 |
+
dataset_root = datasets_root.joinpath(datasets_name)
|
18 |
+
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
|
19 |
+
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
20 |
+
assert all(input_dir.exists() for input_dir in input_dirs)
|
21 |
+
|
22 |
+
# Create the output directories for each output file type
|
23 |
+
out_dir.joinpath("mels").mkdir(exist_ok=True)
|
24 |
+
out_dir.joinpath("audio").mkdir(exist_ok=True)
|
25 |
+
|
26 |
+
# Create a metadata file
|
27 |
+
metadata_fpath = out_dir.joinpath("train.txt")
|
28 |
+
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="cp949")
|
29 |
+
|
30 |
+
# Preprocess the dataset
|
31 |
+
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
32 |
+
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
|
33 |
+
hparams=hparams, no_alignments=no_alignments)
|
34 |
+
job = Pool(n_processes).imap(func, speaker_dirs)
|
35 |
+
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
|
36 |
+
for metadatum in speaker_metadata:
|
37 |
+
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
38 |
+
metadata_file.close()
|
39 |
+
|
40 |
+
# Verify the contents of the metadata file
|
41 |
+
with metadata_fpath.open("r", encoding="cp949") as metadata_file:
|
42 |
+
metadata = [line.split("|") for line in metadata_file]
|
43 |
+
mel_frames = sum([int(m[4]) for m in metadata])
|
44 |
+
timesteps = sum([int(m[3]) for m in metadata])
|
45 |
+
sample_rate = hparams.sample_rate
|
46 |
+
hours = (timesteps / sample_rate) / 3600
|
47 |
+
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
|
48 |
+
(len(metadata), mel_frames, timesteps, hours))
|
49 |
+
print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
|
50 |
+
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
51 |
+
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
52 |
+
|
53 |
+
|
54 |
+
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
|
55 |
+
metadata = []
|
56 |
+
for book_dir in speaker_dir.glob("*"):
|
57 |
+
if no_alignments:
|
58 |
+
# Gather the utterance audios and texts
|
59 |
+
# LibriTTS uses .wav but we will include extensions for compatibility with other datasets
|
60 |
+
extensions = ["*.wav", "*.flac", "*.mp3"]
|
61 |
+
for extension in extensions:
|
62 |
+
wav_fpaths = book_dir.glob(extension)
|
63 |
+
|
64 |
+
for wav_fpath in wav_fpaths:
|
65 |
+
# Load the audio waveform
|
66 |
+
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
|
67 |
+
if hparams.rescale:
|
68 |
+
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
69 |
+
|
70 |
+
# Get the corresponding text
|
71 |
+
# Check for .txt (for compatibility with other datasets)
|
72 |
+
text_fpath = wav_fpath.with_suffix(".txt")
|
73 |
+
if not text_fpath.exists():
|
74 |
+
# Check for .normalized.txt (LibriTTS)
|
75 |
+
text_fpath = wav_fpath.with_suffix(".normalized.txt")
|
76 |
+
assert text_fpath.exists()
|
77 |
+
with text_fpath.open("r") as text_file:
|
78 |
+
text = "".join([line for line in text_file])
|
79 |
+
text = text.replace("\"", "")
|
80 |
+
text = text.strip()
|
81 |
+
|
82 |
+
# Process the utterance
|
83 |
+
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
|
84 |
+
skip_existing, hparams))
|
85 |
+
else:
|
86 |
+
# Process alignment file (LibriSpeech support)
|
87 |
+
# Gather the utterance audios and texts
|
88 |
+
try:
|
89 |
+
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
|
90 |
+
with alignments_fpath.open("r") as alignments_file:
|
91 |
+
alignments = [line.rstrip().split(" ") for line in alignments_file]
|
92 |
+
except StopIteration:
|
93 |
+
# A few alignment files will be missing
|
94 |
+
continue
|
95 |
+
|
96 |
+
# Iterate over each entry in the alignments file
|
97 |
+
for wav_fname, words, end_times in alignments:
|
98 |
+
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
|
99 |
+
assert wav_fpath.exists()
|
100 |
+
words = words.replace("\"", "").split(",")
|
101 |
+
end_times = list(map(float, end_times.replace("\"", "").split(",")))
|
102 |
+
|
103 |
+
# Process each sub-utterance
|
104 |
+
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
|
105 |
+
for i, (wav, text) in enumerate(zip(wavs, texts)):
|
106 |
+
sub_basename = "%s_%02d" % (wav_fname, i)
|
107 |
+
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
|
108 |
+
skip_existing, hparams))
|
109 |
+
|
110 |
+
return [m for m in metadata if m is not None]
|
111 |
+
|
112 |
+
|
113 |
+
def split_on_silences(wav_fpath, words, end_times, hparams):
|
114 |
+
# Load the audio waveform
|
115 |
+
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
|
116 |
+
if hparams.rescale:
|
117 |
+
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
118 |
+
|
119 |
+
words = np.array(words)
|
120 |
+
start_times = np.array([0.0] + end_times[:-1])
|
121 |
+
end_times = np.array(end_times)
|
122 |
+
assert len(words) == len(end_times) == len(start_times)
|
123 |
+
assert words[0] == "" and words[-1] == ""
|
124 |
+
|
125 |
+
# Find pauses that are too long
|
126 |
+
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
|
127 |
+
mask[0] = mask[-1] = True
|
128 |
+
breaks = np.where(mask)[0]
|
129 |
+
|
130 |
+
# Profile the noise from the silences and perform noise reduction on the waveform
|
131 |
+
silence_times = [[start_times[i], end_times[i]] for i in breaks]
|
132 |
+
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
|
133 |
+
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
|
134 |
+
if len(noisy_wav) > hparams.sample_rate * 0.02:
|
135 |
+
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
|
136 |
+
wav = logmmse.denoise(wav, profile, eta=0)
|
137 |
+
|
138 |
+
# Re-attach segments that are too short
|
139 |
+
segments = list(zip(breaks[:-1], breaks[1:]))
|
140 |
+
segment_durations = [start_times[end] - end_times[start] for start, end in segments]
|
141 |
+
i = 0
|
142 |
+
while i < len(segments) and len(segments) > 1:
|
143 |
+
if segment_durations[i] < hparams.utterance_min_duration:
|
144 |
+
# See if the segment can be re-attached with the right or the left segment
|
145 |
+
left_duration = float("inf") if i == 0 else segment_durations[i - 1]
|
146 |
+
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
|
147 |
+
joined_duration = segment_durations[i] + min(left_duration, right_duration)
|
148 |
+
|
149 |
+
# Do not re-attach if it causes the joined utterance to be too long
|
150 |
+
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
|
151 |
+
i += 1
|
152 |
+
continue
|
153 |
+
|
154 |
+
# Re-attach the segment with the neighbour of shortest duration
|
155 |
+
j = i - 1 if left_duration <= right_duration else i
|
156 |
+
segments[j] = (segments[j][0], segments[j + 1][1])
|
157 |
+
segment_durations[j] = joined_duration
|
158 |
+
del segments[j + 1], segment_durations[j + 1]
|
159 |
+
else:
|
160 |
+
i += 1
|
161 |
+
|
162 |
+
# Split the utterance
|
163 |
+
segment_times = [[end_times[start], start_times[end]] for start, end in segments]
|
164 |
+
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
|
165 |
+
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
|
166 |
+
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]
|
167 |
+
|
168 |
+
# # DEBUG: play the audio segments (run with -n=1)
|
169 |
+
# import sounddevice as sd
|
170 |
+
# if len(wavs) > 1:
|
171 |
+
# print("This sentence was split in %d segments:" % len(wavs))
|
172 |
+
# else:
|
173 |
+
# print("There are no silences long enough for this sentence to be split:")
|
174 |
+
# for wav, text in zip(wavs, texts):
|
175 |
+
# # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
|
176 |
+
# # when playing them. You shouldn't need to do that in your parsers.
|
177 |
+
# wav = np.concatenate((wav, [0] * 16000))
|
178 |
+
# print("\t%s" % text)
|
179 |
+
# sd.play(wav, 16000, blocking=True)
|
180 |
+
# print("")
|
181 |
+
|
182 |
+
return wavs, texts
|
183 |
+
|
184 |
+
|
185 |
+
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
186 |
+
skip_existing: bool, hparams):
|
187 |
+
## FOR REFERENCE:
|
188 |
+
# For you not to lose your head if you ever wish to change things here or implement your own
|
189 |
+
# synthesizer.
|
190 |
+
# - Both the audios and the mel spectrograms are saved as numpy arrays
|
191 |
+
# - There is no processing done to the audios that will be saved to disk beyond volume
|
192 |
+
# normalization (in split_on_silences)
|
193 |
+
# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
|
194 |
+
# is why we re-apply it on the audio on the side of the vocoder.
|
195 |
+
# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
|
196 |
+
# without extra padding. This means that you won't have an exact relation between the length
|
197 |
+
# of the wav and of the mel spectrogram. See the vocoder data loader.
|
198 |
+
|
199 |
+
|
200 |
+
# Skip existing utterances if needed
|
201 |
+
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
|
202 |
+
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
|
203 |
+
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
|
204 |
+
return None
|
205 |
+
|
206 |
+
# Trim silence
|
207 |
+
if hparams.trim_silence:
|
208 |
+
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
|
209 |
+
|
210 |
+
# Skip utterances that are too short
|
211 |
+
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
|
212 |
+
return None
|
213 |
+
|
214 |
+
# Compute the mel spectrogram
|
215 |
+
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
|
216 |
+
mel_frames = mel_spectrogram.shape[1]
|
217 |
+
|
218 |
+
# Skip utterances that are too long
|
219 |
+
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
|
220 |
+
return None
|
221 |
+
|
222 |
+
# Write the spectrogram, embed and audio to disk
|
223 |
+
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
|
224 |
+
np.save(wav_fpath, wav, allow_pickle=False)
|
225 |
+
|
226 |
+
# Return a tuple describing this training example
|
227 |
+
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
|
228 |
+
|
229 |
+
|
230 |
+
def embed_utterance(fpaths, encoder_model_fpath):
|
231 |
+
if not encoder.is_loaded():
|
232 |
+
encoder.load_model(encoder_model_fpath)
|
233 |
+
|
234 |
+
# Compute the speaker embedding of the utterance
|
235 |
+
wav_fpath, embed_fpath = fpaths
|
236 |
+
wav = np.load(wav_fpath)
|
237 |
+
wav = encoder.preprocess_wav(wav)
|
238 |
+
embed = encoder.embed_utterance(wav)
|
239 |
+
np.save(embed_fpath, embed, allow_pickle=False)
|
240 |
+
|
241 |
+
|
242 |
+
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
|
243 |
+
wav_dir = synthesizer_root.joinpath("audio")
|
244 |
+
metadata_fpath = synthesizer_root.joinpath("train.txt")
|
245 |
+
assert wav_dir.exists() and metadata_fpath.exists()
|
246 |
+
embed_dir = synthesizer_root.joinpath("embeds")
|
247 |
+
embed_dir.mkdir(exist_ok=True)
|
248 |
+
|
249 |
+
# Gather the input wave filepath and the target output embed filepath
|
250 |
+
with metadata_fpath.open("r",encoding='cp949') as metadata_file:
|
251 |
+
metadata = [line.split("|") for line in metadata_file]
|
252 |
+
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
|
253 |
+
|
254 |
+
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
255 |
+
# Embed the utterances in separate threads
|
256 |
+
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
257 |
+
job = Pool(n_processes).imap(func, fpaths)
|
258 |
+
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
259 |
+
|
preprocess_kspon.py
ADDED
@@ -0,0 +1,65 @@
from utils.argutils import print_args
from pathlib import Path
import argparse
import sys
import wave
import os
from itertools import chain
from tqdm import tqdm
import re

def preprocess_kspon(input_dirs):
    folders = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))

    for folder in tqdm(folders, "folders", len(folders), unit="folders"):
        texts = list()
        symbol = ["o/", "b/", "l/", "n/", "u/", "+", "*", "(", "/"]
        punctuation = [" ", ".", "?", "!"]
        white = [" ", "  ", ",,", ",,,"]

        existing_fnames = list()
        for file in folder.glob("*"):
            existing_fnames.append(file)

            if str(file).endswith(".txt") and not str(file).endswith("alignment.txt"):
                s = os.path.splitext(file)  # split off the file extension
                s = os.path.split(s[0])     # split the remaining path into directory and base name

                with open(file, "r", encoding='cp949') as f:
                    texts.append(s[1] + "$\"" + "|" + " ".join(f.read().splitlines()) + "|" + "\"\n")

        for i, text in enumerate(texts):
            text = re.sub('\)\/\([가-힣\s\w]*\)', "", text)
            for sym in symbol:
                text = text.replace(sym, "")
            for pun in punctuation:
                text = text.replace(pun, " ")
            for wh in white:
                text = text.replace(wh, ",")
            text = text.replace("$", " ")
            text = text.replace("|", ",")
            text = text.replace(",,", ",")
            texts[i] = text
        with open(os.path.join(folder, os.path.basename(folder) + "_alignment.txt"), "w", encoding='cp949') as a:
            for text in texts:
                a.write(text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="pcm, raw 확장자 파일을 wav확장자로 변환",  # "Convert .pcm/.raw files to .wav"
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("path", type=str, help="처리할 폴더 경로")  # "path of the folder to process"
    args = parser.parse_args()

    dataset_root = Path(args.path)
    input_dirs = [dataset_root.joinpath("KsponSpeech_01"),
                  dataset_root.joinpath("KsponSpeech_02"),
                  dataset_root.joinpath("KsponSpeech_03"),
                  dataset_root.joinpath("KsponSpeech_04"),
                  dataset_root.joinpath("KsponSpeech_05")]

    preprocess_kspon(input_dirs)
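Illustrative invocation of the script above (the dataset root is a placeholder path; it must contain the KsponSpeech_01 … KsponSpeech_05 folders listed in the code):

# python preprocess_kspon.py /path/to/KsponSpeech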
profiler.py
ADDED
@@ -0,0 +1,45 @@
from time import perf_counter as timer
from collections import OrderedDict
import numpy as np


class Profiler:
    def __init__(self, summarize_every=5, disabled=False):
        self.last_tick = timer()
        self.logs = OrderedDict()
        self.summarize_every = summarize_every
        self.disabled = disabled

    def tick(self, name):
        if self.disabled:
            return

        # Log the time needed to execute that function
        if not name in self.logs:
            self.logs[name] = []
        if len(self.logs[name]) >= self.summarize_every:
            self.summarize()
            self.purge_logs()
        self.logs[name].append(timer() - self.last_tick)

        self.reset_timer()

    def purge_logs(self):
        for name in self.logs:
            self.logs[name].clear()

    def reset_timer(self):
        self.last_tick = timer()

    def summarize(self):
        n = max(map(len, self.logs.values()))
        assert n == self.summarize_every
        print("\nAverage execution time over %d steps:" % n)

        name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()]
        pad = max(map(len, name_msgs))
        for name_msg, deltas in zip(name_msgs, self.logs.values()):
            print("  %s  mean: %4.0fms   std: %4.0fms" %
                  (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000))
        print("", flush=True)
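A tiny self-contained example of the Profiler above (the sleep calls stand in for real work such as data loading and the training step):

from time import sleep

profiler = Profiler(summarize_every=5)
for step in range(10):
    sleep(0.01)                 # stands in for data loading
    profiler.tick("Data")
    sleep(0.02)                 # stands in for the forward/backward pass
    profiler.tick("Step")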
random_cycler.py
ADDED
@@ -0,0 +1,37 @@
import random

class RandomCycler:
    """
    Creates an internal copy of a sequence and allows access to its items in a constrained random
    order. For a source sequence of n items and one or several consecutive queries of a total
    of m items, the following guarantees hold (one implies the other):
        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
    """

    def __init__(self, source):
        if len(source) == 0:
            raise Exception("Can't create RandomCycler from an empty collection")
        self.all_items = list(source)
        self.next_items = []

    def sample(self, count: int):
        shuffle = lambda l: random.sample(l, len(l))

        out = []
        while count > 0:
            if count >= len(self.all_items):
                out.extend(shuffle(list(self.all_items)))
                count -= len(self.all_items)
                continue
            n = min(count, len(self.next_items))
            out.extend(self.next_items[:n])
            count -= n
            self.next_items = self.next_items[n:]
            if len(self.next_items) == 0:
                self.next_items = shuffle(list(self.all_items))
        return out

    def __next__(self):
        return self.sample(1)[0]
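A short illustration of the sampling guarantees described in the docstring above:

cycler = RandomCycler(["a", "b", "c"])
print(cycler.sample(7))   # each of the 3 items appears 2 or 3 times (7 // 3 up to (6 // 3) + 1)
print(next(cycler))       # a single item, still respecting the cycling constraint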
requirements.txt
ADDED
@@ -0,0 +1,19 @@
torch
nltk
umap-learn
visdom
librosa>=0.8.0
matplotlib>=3.3.0
numpy==1.19.3; platform_system == "Windows"
numpy==1.19.4; platform_system != "Windows"
scipy>=1.0.0
tqdm
sounddevice
SoundFile
Unidecode
inflect
PyQt5
multiprocess
numba
webrtcvad; platform_system != "Windows"
jamo
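These requirements are installed in the usual way (PyTorch may need the platform-specific command from pytorch.org rather than the plain pip package):

# pip install -r requirements.txt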
speaker.py
ADDED
@@ -0,0 +1,40 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.utterance import Utterance
from pathlib import Path

# Contains the set of utterances of a single speaker
class Speaker:
    def __init__(self, root: Path):
        self.root = root
        self.name = root.name
        self.utterances = None
        self.utterance_cycler = None

    def _load_utterances(self):
        with self.root.joinpath("_sources.txt").open("r") as sources_file:
            sources = [l.split(",") for l in sources_file]
        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
        self.utterance_cycler = RandomCycler(self.utterances)

    def random_partial(self, count, n_frames):
        """
        Samples a batch of <count> unique partial utterances from the disk in a way that all
        utterances come up at least once every two cycles and in a random order every time.

        :param count: The number of partial utterances to sample from the set of utterances from
        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
        the number of utterances available.
        :param n_frames: The number of frames in the partial utterance.
        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
        frames are the frames of the partial utterances and range is the range of the partial
        utterance with regard to the complete utterance.
        """
        if self.utterances is None:
            self._load_utterances()

        utterances = self.utterance_cycler.sample(count)

        a = [(u,) + u.random_partial(n_frames) for u in utterances]

        return a
speaker_batch.py
ADDED
@@ -0,0 +1,12 @@
import numpy as np
from typing import List
from encoder.data_objects.speaker import Speaker

class SpeakerBatch:
    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
        self.speakers = speakers
        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}

        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
speaker_verification_dataset.py
ADDED
@@ -0,0 +1,56 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.speaker_batch import SpeakerBatch
from encoder.data_objects.speaker import Speaker
from encoder.params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

# TODO: improve with a pool of speakers for data efficiency

class SpeakerVerificationDataset(Dataset):
    def __init__(self, datasets_root: Path):
        self.root = datasets_root
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
        if len(speaker_dirs) == 0:
            raise Exception("No speakers found. Make sure you are pointing to the directory "
                            "containing all preprocessed speaker directories.")
        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        return int(1e10)

    def __getitem__(self, index):
        return next(self.speaker_cycler)

    def get_logs(self):
        log_string = ""
        for log_fpath in self.root.glob("*.txt"):
            with log_fpath.open("r") as log_file:
                log_string += "".join(log_file.readlines())
        return log_string


class SpeakerVerificationDataLoader(DataLoader):
    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
                 worker_init_fn=None):
        self.utterances_per_speaker = utterances_per_speaker

        super().__init__(
            dataset=dataset,
            batch_size=speakers_per_batch,
            shuffle=False,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=self.collate,
            pin_memory=pin_memory,
            drop_last=False,
            timeout=timeout,
            worker_init_fn=worker_init_fn
        )

    def collate(self, speakers):
        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
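A minimal sketch of how the dataset and loader above are combined for GE2E training (the preprocessed root is a placeholder path; the batch dimensions mirror encoder/params_model.py):

from pathlib import Path

dataset = SpeakerVerificationDataset(Path("datasets/SV2TTS/encoder"))   # hypothetical location
loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch=64,
                                       utterances_per_speaker=10, num_workers=2)
for speaker_batch in loader:
    frames = speaker_batch.data   # (64 * 10, partials_n_frames, mel_n_channels)
    break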
symbols.py
ADDED
@@ -0,0 +1,20 @@
"""
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
# from . import cmudict
from .korean import ALL_SYMBOLS, PAD, EOS


# For english
en_symbols = PAD + EOS + 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '  # <- For deployment (because korean ALL_SYMBOLS follow this convention)

symbols = ALL_SYMBOLS  # for korean
#
# # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
# #_arpabet = ["@' + s for s in cmudict.valid_symbols]
#
# # Export all symbols:
# symbols = [PAD, EOS] + list(_characters) #+ _arpabet
synthesize.py
ADDED
@@ -0,0 +1,97 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.hparams import hparams_debug_string
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.text import text_to_sequence
from synthesizer.utils.symbols import symbols
# data_parallel_workaround is called below for multi-GPU synthesis; it is assumed to be
# provided by synthesizer.utils, as in the upstream SV2TTS code this file is based on.
from synthesizer.utils import data_parallel_workaround
import numpy as np
from pathlib import Path
from tqdm import tqdm


def run_synthesis(in_dir, out_dir, model_dir, hparams):
    # This generates ground-truth-aligned (GTA) mels for vocoder training
    synth_dir = Path(out_dir).joinpath("mels_gta")
    synth_dir.mkdir(exist_ok=True)
    print(hparams_debug_string(hparams))

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for GTA mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    model_dir = Path(model_dir)
    model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt")
    print("\nLoading weights at %s" % model_fpath)
    model.load(model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using the same reduction factor the model is currently trained with
    r = np.int32(model.r)

    # Set model to eval mode (disable gradient and zoneout)
    model.eval()

    # Initialize the dataset
    in_dir = Path(in_dir)
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    data_loader = DataLoader(dataset,
                             collate_fn=lambda batch: collate_synthesizer(batch, r, hparams),
                             batch_size=hparams.synthesis_batch_size,
                             # On Windows, num_workers must stay 0: the lambda collate_fn cannot be
                             # pickled ("Can't pickle local object 'run_synthesis.<locals>.<lambda>'").
                             num_workers=0,
                             shuffle=False,
                             pin_memory=True)

    # Generate GTA mels
    meta_out_fpath = Path(out_dir).joinpath("synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)):
            texts = texts.to(device)
            mels = mels.to(device)
            embeds = embeds.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == "cuda" and torch.cuda.device_count() > 1:
                _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
            else:
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: generated mel files get the same names as their targets, just in a different folder
                mel_filename = Path(synth_dir).joinpath(dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(dataset.metadata[k]))
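A minimal sketch of calling run_synthesis directly, assuming the preprocessed synthesizer data sits under <datasets_root>/SV2TTS/synthesizer (the layout synthesizer_preprocess_audio.py creates) and that the checkpoint directory contains a .pt file named after the directory, as model_fpath above expects. The concrete paths are placeholders.

# Illustrative invocation; paths are placeholders.
from pathlib import Path
from synthesizer.hparams import hparams
from synthesize import run_synthesis

datasets_root = Path("datasets")
in_dir = datasets_root.joinpath("SV2TTS", "synthesizer")   # contains mels/, embeds/, train.txt
out_dir = in_dir                                           # mels_gta/ and synthesized.txt are written here
model_dir = Path("synthesizer/saved_models/my_run")        # expects my_run/my_run.pt

run_synthesis(in_dir, out_dir, model_dir, hparams)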
synthesizer_dataset.py
ADDED
@@ -0,0 +1,96 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence
import nltk
nltk.download('punkt')  # the Korean text pipeline relies on NLTK's "punkt" tokenizer data


class SynthesizerDataset(Dataset):
    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        # The metadata file is expected to be cp949-encoded (Korean KSponSpeech preprocessing)
        with metadata_fpath.open("r", encoding="cp949") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        mel_fnames = [x[1] for x in metadata if int(x[4])]
        mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
        embed_fnames = [x[2] for x in metadata if int(x[4])]
        embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
        self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
        self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to the first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        print(self.samples_texts[index])  # debug: show the raw text for this sample

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel.astype(np.float32), embed.astype(np.float32), index

    def __len__(self):
        return len(self.samples_fpaths)


def collate_synthesizer(batch, r, hparams):
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    # Mel spectrogram
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % r != 0:
        max_spec_len += r - max_spec_len % r

    # WaveRNN mel spectrograms are normalized to [0, 1], so zero padding adds silence.
    # By default, SV2TTS uses symmetric mels, where -1 * max_abs_value is silence.
    if hparams.symmetric_mels:
        mel_pad_value = -1 * hparams.max_abs_value
    else:
        mel_pad_value = 0

    mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
    mel = np.stack(mel)

    # Speaker embedding (SV2TTS)
    embeds = [x[2] for x in batch]

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]

    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)


def pad2d(x, max_len, pad_value=0):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
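To make the padding and reduction-factor rounding concrete, here is a small self-contained check. The import path is assumed from the `from synthesizer.synthesizer_dataset import ...` line in synthesize.py, and the array sizes are arbitrary.

# Illustrative check of pad1d/pad2d behaviour; values are arbitrary.
import numpy as np
from synthesizer.synthesizer_dataset import pad1d, pad2d

mel = np.random.rand(80, 37).astype(np.float32)  # 80 mel bands, 37 frames
r = 2                                            # example reduction factor
max_spec_len = mel.shape[-1] + 1                 # 38
if max_spec_len % r != 0:
    max_spec_len += r - max_spec_len % r

padded = pad2d(mel, max_spec_len, pad_value=-4.0)  # -4.0 standing in for -max_abs_value
print(padded.shape)                                # (80, 38)
print(pad1d([3, 1, 2], 5))                         # [3 1 2 0 0]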
synthesizer_preprocess_audio.py
ADDED
@@ -0,0 +1,42 @@
from synthesizer.Kor_preprocess import preprocess_KSponSpeech
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your LibriSpeech/TTS datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=None, help=\
        "Number of processes in parallel.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing output files with the same name. Useful if the preprocessing "
        "was interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Preprocess the dataset
    print_args(args, parser)
    args.hparams = hparams.parse(args.hparams)
    # preprocess_librispeech(**vars(args))
    preprocess_KSponSpeech(**vars(args))
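The same preprocessing can also be invoked programmatically; the sketch below mirrors the keyword arguments the script forwards through **vars(args). The datasets path is a placeholder.

# Illustrative sketch; mirrors the arguments the CLI wrapper above forwards.
from pathlib import Path
from synthesizer.Kor_preprocess import preprocess_KSponSpeech
from synthesizer.hparams import hparams

datasets_root = Path("datasets")                           # placeholder
out_dir = datasets_root.joinpath("SV2TTS", "synthesizer")  # same default as the script
out_dir.mkdir(exist_ok=True, parents=True)

preprocess_KSponSpeech(datasets_root=datasets_root,
                       out_dir=out_dir,
                       n_processes=None,
                       skip_existing=False,
                       hparams=hparams.parse(""))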