|
from speaker_encoder import inference as encoder
|
|
from multiprocessing.pool import Pool
|
|
from functools import partial
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def embed_utterance(fpaths, encoder_model_fpath):
    """Compute the speaker embedding of one utterance and save it to disk.

    Args:
        fpaths: A ``(wav_fpath, embed_fpath)`` pair. The first path is read
            with ``np.load``; the second is where the embedding is written
            with ``np.save``.
        encoder_model_fpath: Path handed to ``encoder.load_model`` when the
            encoder reports that it is not loaded yet.

    Returns:
        None. The result is the file written at ``embed_fpath``.
    """
    # Load the encoder lazily, so the model is only loaded when
    # ``encoder.is_loaded()`` reports that it is missing.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    source_fpath, target_fpath = fpaths

    # Load the stored waveform, run the encoder's own preprocessing on it,
    # then embed the preprocessed signal.
    utterance_embed = encoder.embed_utterance(
        encoder.preprocess_wav(np.load(source_fpath))
    )

    # allow_pickle=False: store a plain array rather than a pickled object.
    np.save(target_fpath, utterance_embed, allow_pickle=False)
|
|
|
|
|
|
def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
    """Embed every utterance listed in ``<outdir_root>/train.txt``.

    Each line of ``train.txt`` is split on ``"|"``. Field 0 is taken as the
    waveform file name inside ``<outdir_root>/audio`` and field 2 as the
    embedding file name to create inside ``<outdir_root>/embeds``. The
    embeddings are computed by ``embed_utterance`` in a pool of worker
    processes.

    Args:
        outdir_root: Directory containing ``audio/`` and ``train.txt``;
            ``embeds/`` is created inside it if missing.
        wav_dir: Currently ignored (see the review note in the body).
        encoder_model_fpath: Forwarded to ``embed_utterance`` for every
            utterance.
        n_processes: Number of worker processes in the pool.

    Raises:
        AssertionError: If ``<outdir_root>/audio`` or
            ``<outdir_root>/train.txt`` does not exist.
    """
    # NOTE(review): the ``wav_dir`` argument is overwritten here, exactly as
    # in the previous implementation, so whatever the caller passes has no
    # effect. Confirm with the callers whether it should be honoured instead.
    wav_dir = outdir_root.joinpath("audio")
    # The root directory parameter is ``outdir_root``; the previous code
    # referenced an undefined ``synthesizer_root`` and raised NameError.
    metadata_fpath = outdir_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists(), (
        f"expected both {wav_dir} and {metadata_fpath} to exist"
    )
    embed_dir = outdir_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input waveform path and the target embedding path for each
    # metadata line.
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # Embed the utterances in separate processes. The ``with`` block makes
    # sure the workers are shut down once every result has been consumed,
    # or if an error is raised.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        # Draining the iterator inside the ``with`` block drives the progress
        # bar and waits for all jobs before the pool is terminated.
        list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))