"""Hugging Face ``datasets`` loading script for the annotated-subtitles data."""

import datasets
import kaldiio  # noqa: F401  (not used by the current code path; kept deliberately)
import numpy as np  # noqa: F401  (not used by the current code path; kept deliberately)
from transformers.utils import logging

logger = logging.get_logger(__name__)

_DESCRIPTION = "Annotated Subtitles"

# Directory holding the dumped data, keyed by builder-config name.
_FILEPATHS = {
    "fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/subs_annot",
    "raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/subs_annot",
}

# Name of the "<uttid> <value>" index file inside that directory, keyed by
# builder-config name.
_FEATURES_NAME = {
    "fbank_pitch": "feats.scp",
    "raw": "wav.scp",
}


def _read_transcripts(path: str) -> dict:
    """Parse a ``<uttid> <transcript>`` file into ``{uttid: transcript}``.

    The utterance id is everything before the first space; the transcript is
    the remainder of the line with trailing whitespace removed. Lines that
    contain no space after stripping (blank lines, ids without a transcript)
    are ignored. If an id occurs more than once, the last transcript wins.
    """
    transcripts = {}
    # NOTE(review): the files are assumed to be UTF-8 encoded -- confirm.
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            uttid, sep, transcript = line.rstrip().partition(" ")
            if sep:
                transcripts[uttid] = transcript
    return transcripts


def _read_scp(path: str) -> dict:
    """Parse a ``<uttid> <value>`` index file into ``{uttid: value}``.

    Each line is split once on whitespace. Blank lines are skipped; lines
    that carry an id but no value are skipped with a warning instead of
    aborting the whole load.
    """
    entries = {}
    # NOTE(review): the files are assumed to be UTF-8 encoded -- confirm.
    with open(path, "r", encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            fields = line.strip().split(maxsplit=1)
            if not fields:
                continue
            if len(fields) != 2:
                logger.warning(f"Skipping malformed line {lineno} in {path}")
                continue
            uttid, value = fields
            entries[uttid] = value
    return entries


class CGNConfig(datasets.BuilderConfig):
    """Builder configuration pinned to dataset version 2.6.1."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: keyword arguments forwarded unchanged to
                ``datasets.BuilderConfig`` (this file passes ``name`` and
                ``description``).
        """
        super().__init__(version=datasets.Version("2.6.1", ""), **kwargs)


class CGN(datasets.GeneratorBasedBuilder):
    """Dataset builder that pairs each transcript with its index-file entry."""

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "raw"
    BUILDER_CONFIGS = [
        CGNConfig(name="raw", description="All Components"),
    ]

    def _info(self):
        """Return the dataset metadata: three string features per example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("text",),
        )

    def _split_generators(self, _):
        """Return the single ``"test"`` split; the argument is not used."""
        return [
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={},
            )
        ]

    def _generate_examples(self):
        """Yield ``(key, example)`` pairs for the active configuration.

        ``key`` is the position of the utterance in the transcript file's
        parsed order. Utterances that have a transcript but no entry in the
        index file are logged and skipped, so keys may have gaps.

        NOTE: ``"audio"`` carries the raw string stored in the index file
        (presumably a path or Kaldi specifier -- verify against the dump).
        Earlier revisions contained disabled code that loaded the actual
        features via ``kaldiio.load_scp`` / ``np.load`` instead.
        """
        config_name = self.config.name
        data_dir = _FILEPATHS[config_name]

        transcripts = _read_transcripts(f"{data_dir}/text")
        index = _read_scp(f"{data_dir}/{_FEATURES_NAME[config_name]}")

        for key, (uttid, transcript) in enumerate(transcripts.items()):
            if uttid not in index:
                logger.warning(f"Missing utterance: {uttid}")
                continue
            yield key, {"audio": index[uttid], "text": transcript, "id": uttid}