Spaces:
Runtime error
Runtime error
File size: 2,447 Bytes
62e9ca6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# ----------------------------------------------------------------------------
# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
#
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# ----------------------------------------------------------------------------
import argparse
import logging
from pathlib import Path
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import numpy as np
from examples.speech_to_text.data_utils import save_df_to_tsv
log = logging.getLogger(__name__)
def get_duration(fa_phone):
    """Return the length of every run of identical consecutive phones.

    Args:
        fa_phone: force-aligned phone sequence, 1-D. A numpy array or any
            1-D sequence that ``np.asarray`` can convert (e.g. a list).

    Returns:
        1-D integer numpy array with one entry per run, in order of
        appearance; e.g. ``[7, 7, 3, 3, 3]`` gives ``[2, 3]``. An empty
        input gives an empty array.
    """
    fa_phone = np.asarray(fa_phone)
    if fa_phone.size == 0:
        # Without this guard the two sentinel boundaries below would sit next
        # to each other and report a single bogus run of length 1.
        return np.zeros(0, dtype=np.intp)

    # True where a new run starts, plus one sentinel just past the last item,
    # so the distance between consecutive True positions is the run length.
    boundary = np.concatenate(([True], fa_phone[:-1] != fa_phone[1:], [True]))
    starts = np.where(boundary)[0]
    return np.diff(starts)
def process(args):
    """Write one TSV manifest per split from the ``<split>.phn`` text files.

    Each line of ``<audio_manifest_root>/<split>.phn`` becomes one row with
    the columns ``id``, ``speaker``, ``n_frames``, ``tgt_text`` and ``unit``.
    The resulting table is written to ``<output_root>/<split>.tsv`` through
    ``save_df_to_tsv``.

    Args:
        args: parsed command-line namespace; reads ``output_root``,
            ``audio_manifest_root`` and ``splits``.
    """
    out_root = Path(args.output_root).absolute()
    # parents=True: a nested output path whose parents do not exist yet would
    # otherwise raise FileNotFoundError.
    out_root.mkdir(parents=True, exist_ok=True)

    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    for s in args.splits:
        manifest = defaultdict(list)
        # Explicit encoding so reading does not depend on the platform locale.
        with open(audio_manifest_root / f"{s}.phn", encoding="utf-8") as f1:
            for i, reduced_phone in tqdm(enumerate(f1)):
                reduced_phone = reduced_phone.strip()
                # Ids are synthetic: the zero-based line index within the split.
                uttid = f"librilm-{i}"
                # Prefix before the first "-", i.e. always "librilm" here.
                speaker = uttid.split("-")[0]

                manifest["id"].append(uttid)
                manifest["speaker"].append(speaker)
                # NOTE(review): this is the character length of the stripped
                # line, not a count of whitespace-separated phone tokens;
                # confirm this is what the consumer of the TSV expects.
                manifest["n_frames"].append(len(reduced_phone))
                manifest["tgt_text"].append(reduced_phone)
                manifest["unit"].append(0)  # constant 0 for every row
        # One output file per split, named after the split.
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest),
            out_root / f"{s}.tsv",
        )
def main():
    """Parse the command-line options and hand them to ``process``."""
    parser = argparse.ArgumentParser()
    # Required: process() builds a Path from this value, so leaving it out
    # used to fail with a TypeError inside process() instead of a usage error.
    parser.add_argument(
        "--audio-manifest-root", "-m", required=True, type=str,
        help="directory containing the <split>.phn input files",
    )
    parser.add_argument(
        "--output-root", "-o", required=True, type=str,
        help="directory where the <split>.tsv manifests are written",
    )
    parser.add_argument(
        "--splits", "-s", type=str, nargs="+",
        default=["train", "dev", "test"],
        help="names of the splits to convert",
    )
    # Kept for command-line compatibility; nothing in the visible code reads it.
    parser.add_argument("--add-fastspeech-targets", action="store_true")
    args = parser.parse_args()

    process(args)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|