|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
from collections import defaultdict |
|
from itertools import chain |
|
from pathlib import Path |
|
|
|
import numpy as np |
|
import torchaudio |
|
import torchaudio.sox_effects as ta_sox |
|
import yaml |
|
from tqdm import tqdm |
|
|
|
from examples.speech_to_text.data_utils import load_tsv_to_dicts |
|
from examples.speech_synthesis.preprocessing.speaker_embedder import SpkrEmbedder |
|
|
|
|
|
def extract_embedding(audio_path, embedder):
    """Compute a speaker embedding for one audio file.

    The audio is loaded with ``torchaudio.load`` and, when its sample rate
    differs from ``embedder.RATE``, resampled through the sox ``rate``
    effect. Only the first channel (``wav[0]``) is passed to the embedder,
    after being moved to the CUDA device and cast to float.

    Args:
        audio_path: Path of the audio file to load.
        embedder: Callable speaker embedder exposing a ``RATE`` attribute;
            it is called with a one-element list holding the waveform.

    Returns:
        The embedder output converted to a NumPy array, or ``None`` when
        the embedder call (or the device transfer) raises ``RuntimeError``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    target_rate = embedder.RATE
    if sample_rate != target_rate:
        resample_effect = [["rate", str(target_rate)]]
        waveform, sample_rate = ta_sox.apply_effects_tensor(
            waveform, sample_rate, resample_effect
        )

    try:
        first_channel = waveform[0].cuda().float()
        embedding = embedder([first_channel])
        return embedding.cpu().numpy()
    except RuntimeError:
        # Best-effort: a failing utterance is reported as "no embedding"
        # so the caller can skip it.
        return None
|
|
|
|
|
def process(args):
    """Average per-utterance speaker embeddings and save them as a matrix.

    Reads ``<raw_manifest_root>/<split>.tsv`` for every split in
    ``args.splits``, extracts one embedding per sample with the speaker
    embedder loaded from ``args.ckpt``, and averages the embeddings per
    speaker. The result is saved as ``speaker_emb.npy`` under the config's
    ``audio_root``; row ``i`` belongs to the speaker listed on line ``i`` of
    the speaker set file, and speakers without any usable embedding keep an
    all-zero row. The config, extended with ``speaker_emb_filename``, is
    written to ``args.new_config``.

    Args:
        args: Parsed command-line namespace providing ``raw_manifest_root``,
            ``splits``, ``config``, ``new_config`` and ``ckpt``.

    Raises:
        RuntimeError: If no embedding could be extracted from any sample.
        KeyError: If a sample's speaker is absent from the speaker set file.
    """
    print("Fetching data...")
    raw_manifest_root = Path(args.raw_manifest_root).absolute()
    samples = list(
        chain.from_iterable(
            load_tsv_to_dicts(raw_manifest_root / (split + ".tsv"))
            for split in args.splits
        )
    )
    with open(args.config, "r") as f:
        # NOTE(review): yaml.load should only be used on trusted config
        # files; consider yaml.safe_load if the config holds plain data only.
        config = yaml.load(f, Loader=yaml.FullLoader)
    with open(f"{config['audio_root']}/{config['speaker_set_filename']}") as f:
        # One speaker name per line; the line index is the speaker id.
        speaker_to_id = {r.strip(): i for i, r in enumerate(f)}

    embedder = SpkrEmbedder(args.ckpt).cuda()
    speaker_to_cnt = defaultdict(int)
    # Starting from 0.0 lets the first ``+=`` produce a fresh array.
    speaker_to_emb = defaultdict(float)
    emb_dim = None
    for sample in tqdm(samples, desc="extract emb"):
        emb = extract_embedding(sample["audio"], embedder)
        if emb is None:
            continue
        if emb_dim is None:
            # Record the width from a successful extraction; the loop
            # variable itself may be None after the final iteration.
            emb_dim = len(emb)
        speaker_to_cnt[sample["speaker"]] += 1
        speaker_to_emb[sample["speaker"]] += emb

    if emb_dim is None:
        raise RuntimeError(
            "no speaker embedding could be extracted from any sample"
        )

    missed = set(speaker_to_id) - set(speaker_to_emb)
    if missed:
        print(
            f"WARNING: missing embeddings for {len(missed)} speaker:\n{missed}"
        )

    speaker_emb_mat = np.zeros((len(speaker_to_id), emb_dim), float)
    for speaker, emb_sum in speaker_to_emb.items():
        idx = speaker_to_id[speaker]
        speaker_emb_mat[idx, :] = emb_sum / speaker_to_cnt[speaker]

    speaker_emb_name = "speaker_emb.npy"
    speaker_emb_path = f"{config['audio_root']}/{speaker_emb_name}"
    np.save(speaker_emb_path, speaker_emb_mat)
    config["speaker_emb_filename"] = speaker_emb_name

    with open(args.new_config, "w") as f:
        yaml.dump(config, f)
|
|
|
|
|
def main():
    """Parse the command-line options and hand them to ``process``."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--raw-manifest-root", "-m", type=str, required=True)
    cli.add_argument(
        "--splits", "-s", nargs="+", type=str, default=["train"]
    )
    cli.add_argument("--config", "-c", type=str, required=True)
    cli.add_argument("--new-config", "-n", type=str, required=True)
    cli.add_argument(
        "--ckpt", type=str, required=True, help="speaker embedder checkpoint"
    )

    process(cli.parse_args())


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|