Spaces:
Running
Running
File size: 1,589 Bytes
c56c253 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
from functools import partial
from multiprocessing.pool import Pool
from pathlib import Path

import numpy as np
from tqdm import tqdm

from speaker_encoder import inference as encoder
# from utils import logmmse
# import librosa
def embed_utterance(fpaths, encoder_model_fpath: Path) -> None:
    """Compute the speaker embedding of one utterance and save it to disk.

    Args:
        fpaths: A ``(wav_fpath, embed_fpath)`` pair. ``wav_fpath`` is read
            with ``np.load`` and ``embed_fpath`` is where the resulting
            embedding is written with ``np.save``.
        encoder_model_fpath: Path handed to ``encoder.load_model`` when the
            encoder is not loaded yet in the current process.
    """
    # Load the model lazily so that each worker process loads it only once.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)

    # allow_pickle=False: refuse to write object arrays as pickles.
    np.save(embed_fpath, embed, allow_pickle=False)
def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
    """Embed every utterance listed in ``<outdir_root>/train.txt``.

    Each non-blank line of ``train.txt`` is split on ``"|"``; field 0 names
    the input file inside ``<outdir_root>/audio`` and field 2 names the
    output file inside ``<outdir_root>/embeds`` (created if missing). The
    work is spread over a pool of ``n_processes`` worker processes, each
    calling ``embed_utterance``.

    Args:
        outdir_root: Directory holding ``audio/`` and ``train.txt``; the
            ``embeds/`` directory is created inside it.
        wav_dir: Currently ignored, see the review note below.
        encoder_model_fpath: Forwarded to ``embed_utterance``.
        n_processes: Number of worker processes in the pool.

    Raises:
        AssertionError: If ``<outdir_root>/audio`` or
            ``<outdir_root>/train.txt`` does not exist.
    """
    # NOTE(review): the wav_dir argument has always been overwritten here, so
    # the value passed by the caller is unused. Kept as-is to preserve
    # behavior -- confirm whether the parameter should be honored instead.
    wav_dir = outdir_root.joinpath("audio")
    metadata_fpath = outdir_root.joinpath("train.txt")
    assert wav_dir.exists(), f"Audio directory not found: {wav_dir}"
    assert metadata_fpath.exists(), f"Metadata file not found: {metadata_fpath}"
    embed_dir = outdir_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath.
    # Line endings are stripped so they never leak into a file name, and
    # blank lines are skipped instead of failing with an IndexError.
    with metadata_fpath.open("r") as metadata_file:
        metadata = [
            line.rstrip("\r\n").split("|")
            for line in metadata_file
            if line.strip()
        ]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate worker processes. The pool is used as
    # a context manager so the workers are cleaned up even if a job fails;
    # the results are fully consumed before the block exits.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))