"""Extract speaker embeddings ('.npy' files) for LibriTTS utterances with SpeechBrain."""
import argparse
import glob
import os

import numpy
import torch
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
from speechbrain.pretrained import EncoderClassifier
from tqdm import tqdm

# Pretrained model id -> speaker-embedding dimensionality.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}


def f2embed(wav_file, classifier, size_embed, resampler=None):
    """Return an L2-normalized speaker embedding for one wav file.

    Args:
        wav_file: path to a 16 kHz or 24 kHz wav file.
        classifier: a SpeechBrain ``EncoderClassifier``.
        size_embed: expected embedding dimensionality (sanity-checked).
        resampler: 24 kHz -> 16 kHz ``torchaudio`` transform; required when
            the input audio is 24 kHz.

    Returns:
        numpy array of shape ``(size_embed,)``.
    """
    signal, fs = torchaudio.load(wav_file)
    if fs != 16000:
        # LibriTTS originals are 24 kHz; any other rate is unexpected.
        assert fs == 24000, fs
        assert resampler is not None, "a 24k->16k resampler is required for 24 kHz input"
        signal = resampler(signal)
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings


def process(args):
    """Extract an embedding for every utterance in the requested splits."""
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.libritts_root, split)
        # LibriTTS layout: <split>/<speaker>/<chapter>/<utterance>.wav
        wavlst_split = glob.glob(os.path.join(wav_dir, "*", "*", "*.wav"))
        print(f"{split} {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)

    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        # makedirs (not mkdir) so missing parent directories do not crash the run.
        os.makedirs(spkemb_root, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(
        source=args.speaker_embed, run_opts={"device": device}, savedir='/tmp'
    )
    size_embed = spk_model[args.speaker_embed]
    resampler = T.Resample(24000, 16000)
    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
        # utt_id = "<speaker>-<chapter>-<utterance>"; split on os.sep (not "/")
        # so the id is built correctly on Windows as well.
        parts = os.path.normpath(utt_i).split(os.sep)[-3:]
        utt_id = "-".join(parts).replace(".wav", "")
        utt_emb = f2embed(utt_i, classifier, size_embed, resampler)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)


def main():
    """Parse CLI arguments and run the extraction."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--libritts-root", "-i", required=True, type=str,
                        help="LibriTTS root directory.")
    parser.add_argument("--output-root", "-o", required=True, type=str,
                        help="Output directory.")
    parser.add_argument("--speaker-embed", "-s", type=str, required=True,
                        choices=["speechbrain/spkrec-xvect-voxceleb",
                                 "speechbrain/spkrec-ecapa-voxceleb"],
                        help="Pretrained model for extracting speaker embedding.")
    parser.add_argument("--splits", type=str,
                        default="train-clean-100,train-clean-360,dev-clean,test-clean",
                        help="Split of train,dev,test separated by comma.")
    args = parser.parse_args()
    print(f"Loading utterances from {args.libritts_root}/{args.splits}, "
          + f"Save speaker embedding 'npy' to {args.output_root}, "
          + f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size.")
    process(args)


if __name__ == "__main__":
    """
    python examples/text_to_speech/prep_libritts_spkemb.py \
        -i /mnt/default/v-junyiao/dataset/Original/LibriTTS \
        -o /mnt/default/v-junyiao/dataset/Original/LibriTTS/spkrec-ecapa \
        -s speechbrain/spkrec-ecapa-voxceleb
    """
    main()