# speecht5-tts/manifest/utils/prep_libritts_spkemb.py
# Extract per-utterance speaker embeddings for LibriTTS using SpeechBrain
# pretrained speaker encoders, saving one .npy file per utterance.
import os
import glob
import numpy
import argparse
import torchaudio
from speechbrain.pretrained import EncoderClassifier
import torch
from tqdm import tqdm
import torch.nn.functional as F
import torchaudio.transforms as T
# Embedding dimensionality produced by each supported SpeechBrain speaker model.
spk_model = dict([
    ("speechbrain/spkrec-xvect-voxceleb", 512),
    ("speechbrain/spkrec-ecapa-voxceleb", 192),
])
def f2embed(wav_file, classifier, size_embed, resampler=None):
    """Extract an L2-normalized speaker embedding from a single wav file.

    Args:
        wav_file: Path to the input wav file.
        classifier: SpeechBrain ``EncoderClassifier`` used for embedding extraction.
        size_embed: Expected embedding dimensionality (sanity-checked against output).
        resampler: Optional pre-built 24 kHz -> 16 kHz ``torchaudio`` resampler.
            If ``None`` or the file's sample rate is not 24 kHz, a suitable
            resampler is constructed on the fly.

    Returns:
        1-D numpy array of length ``size_embed``.
    """
    signal, fs = torchaudio.load(wav_file)
    if fs != 16000:
        # Original code only accepted 24 kHz input and crashed with a
        # TypeError when no resampler was passed; generalize to any rate
        # by building a resampler when the supplied one does not apply.
        if resampler is None or fs != 24000:
            resampler = T.Resample(fs, 16000)
        signal = resampler(signal)
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings
def process(args):
    """Extract speaker embeddings for every utterance in the requested splits.

    Walks ``<libritts_root>/<split>/<speaker>/<chapter>/*.wav``, runs each file
    through the pretrained speaker encoder, and saves one ``<utt_id>.npy`` per
    utterance into ``args.output_root``.
    """
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.libritts_root, split)
        # LibriTTS layout: <split>/<speaker>/<chapter>/<utterance>.wav
        wavlst_split = glob.glob(os.path.join(wav_dir, "*", "*", "*.wav"))
        print(f"{split} {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)
    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        # makedirs (not mkdir) so missing intermediate directories don't crash.
        os.makedirs(spkemb_root, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(source=args.speaker_embed, run_opts={"device": device}, savedir='/tmp')
    size_embed = spk_model[args.speaker_embed]
    # LibriTTS audio is 24 kHz; the encoder expects 16 kHz input.
    resampler = T.Resample(24000, 16000)
    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
        # utt_id = "<speaker>-<chapter>-<filename>"; split on os.sep (not "/")
        # so the path handling also works on Windows.
        utt_id = "-".join(os.path.normpath(utt_i).split(os.sep)[-3:]).replace(".wav", "")
        utt_emb = f2embed(utt_i, classifier, size_embed, resampler)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
def main():
    """Parse command-line arguments and run speaker-embedding extraction."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--libritts-root", "-i", required=True, type=str, help="LibriTTS root directory.")
    parser.add_argument("--output-root", "-o", required=True, type=str, help="Output directory.")
    # Fixed help-string typo: "emebdding" -> "embedding".
    parser.add_argument("--speaker-embed", "-s", type=str, required=True,
                        choices=["speechbrain/spkrec-xvect-voxceleb", "speechbrain/spkrec-ecapa-voxceleb"],
                        help="Pretrained model for extracting speaker embedding.")
    # Fixed help-string typo: "seperate" -> "separated".
    parser.add_argument("--splits", default="train-clean-100,train-clean-360,dev-clean,test-clean", type=str,
                        help="Split of train,dev,test separated by comma.")
    args = parser.parse_args()
    print(f"Loading utterances from {args.libritts_root}/{args.splits}, "
          + f"Save speaker embedding 'npy' to {args.output_root}, "
          + f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size.")
    process(args)
if __name__ == "__main__":
"""
python examples/text_to_speech/prep_libritts_spkemb.py \
-i /mnt/default/v-junyiao/dataset/Original/LibriTTS \
-o /mnt/default/v-junyiao/dataset/Original/LibriTTS/spkrec-ecapa \
-s speechbrain/spkrec-ecapa-voxceleb
"""
main()