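"""Extract a speaker embedding for every LibriTTS utterance with a pretrained
SpeechBrain speaker encoder (x-vector or ECAPA-TDNN) and save it as a .npy file."""
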
import os
import glob
import numpy
import argparse
import torchaudio
from speechbrain.pretrained import EncoderClassifier
import torch
from tqdm import tqdm
import torch.nn.functional as F
import torchaudio.transforms as T

# Output embedding dimension of each supported SpeechBrain speaker encoder.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}

def f2embed(wav_file, classifier, size_embed, resampler=None):
    signal, fs = torchaudio.load(wav_file)
    if fs != 16000:
        # LibriTTS audio is 24 kHz; resample to the 16 kHz the encoder expects.
        assert fs == 24000, fs
        assert resampler is not None
        signal = resampler(signal)
        fs = 16000
    assert fs == 16000, fs
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        # L2-normalize along the embedding dimension, then drop the batch dim.
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings

def process(args):
    # Collect wav paths: LibriTTS layout is <root>/<split>/<speaker>/<chapter>/<utt>.wav.
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.libritts_root, split)
        wavlst_split = glob.glob(os.path.join(wav_dir, "*", "*", "*.wav"))
        print(f"{split} {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)
    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        os.makedirs(spkemb_root, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Download (or reuse a cached copy of) the pretrained speaker encoder.
    classifier = EncoderClassifier.from_hparams(source=args.speaker_embed, run_opts={"device": device}, savedir="/tmp")
    size_embed = spk_model[args.speaker_embed]
    # LibriTTS is recorded at 24 kHz; the speaker encoders expect 16 kHz.
    resampler = T.Resample(24000, 16000)
    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
        # Utterance id: <speaker>-<chapter>-<filename> (without the .wav extension).
        utt_id = "-".join(utt_i.split("/")[-3:]).replace(".wav", "")
        utt_emb = f2embed(utt_i, classifier, size_embed, resampler)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--libritts-root", "-i", required=True, type=str, help="LibriTTS root directory.")
    parser.add_argument("--output-root", "-o", required=True, type=str, help="Output directory for speaker embeddings.")
    parser.add_argument("--speaker-embed", "-s", type=str, required=True, choices=["speechbrain/spkrec-xvect-voxceleb", "speechbrain/spkrec-ecapa-voxceleb"],
                        help="Pretrained model for extracting speaker embeddings.")
    parser.add_argument("--splits", default="train-clean-100,train-clean-360,dev-clean,test-clean", type=str,
                        help="Comma-separated list of LibriTTS splits to process.")
    args = parser.parse_args()
    print(f"Loading utterances from {args.libritts_root}/{args.splits}, "
        + f"Save speaker embedding 'npy' to {args.output_root}, "
        + f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size.")
    process(args)

if __name__ == "__main__":
    """
    python examples/text_to_speech/prep_libritts_spkemb.py \
        -i /mnt/default/v-junyiao/dataset/Original/LibriTTS \
        -o /mnt/default/v-junyiao/dataset/Original/LibriTTS/spkrec-ecapa \
        -s speechbrain/spkrec-ecapa-voxceleb
    """
    main()