File size: 2,958 Bytes

60aae99

import numpy as np
import kaldiio

import datasets
from transformers.utils import logging


# Module-level logger, obtained from the transformers logging utilities imported above.
logger = logging.get_logger(__name__)

_DESCRIPTION = "Annotated Subtitles"

_FILEPATHS = {
    "fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/subs_annot",
    "raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/subs_annot"
}

_FEATURES_NAME = {
    "fbank_pitch": "feats.scp",
    "raw": "wav.scp"
}


class CGNConfig(datasets.BuilderConfig):
    """Builder configuration whose version defaults to 2.6.1."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
          **kwargs: keyword arguments forwarded unchanged to
            `datasets.BuilderConfig` (e.g. `name`, `description`). If
            `version` is not among them it defaults to
            `datasets.Version("2.6.1", "")`.
        """
        # setdefault instead of a literal `version=` keyword: supplying the
        # keyword both explicitly and through **kwargs raises TypeError.
        kwargs.setdefault("version", datasets.Version("2.6.1", ""))
        super().__init__(**kwargs)


class CGN(datasets.GeneratorBasedBuilder):
    """Dataset builder yielding one record per transcribed utterance.

    Transcripts are read from the ``text`` file and audio references from the
    index file named in ``_FEATURES_NAME``; both live in the directory
    ``_FILEPATHS[<config name>]``. Each record holds the index entry as a
    plain string (``audio``), the transcript (``text``) and the utterance id
    (``id``).
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "raw"
    BUILDER_CONFIGS = [
        CGNConfig(name="raw", description="All Components")
    ]

    def _info(self):
        """Return the dataset metadata: three string-valued columns."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            # NOTE(review): one-element tuple; the datasets documentation
            # describes supervised_keys as an (input, target) pair - confirm
            # this is intended before changing it.
            supervised_keys=("text",),
        )

    def _split_generators(self, _):
        """Return the single ``test`` split; the argument is unused."""
        return [
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={}
            )
        ]

    @staticmethod
    def _read_transcripts(path):
        """Parse ``<uttid> <transcript>`` lines into a dict.

        The id is everything before the first space; lines without any
        transcript after the id are skipped. A later duplicate id overrides
        an earlier one.
        """
        transcripts = {}
        # Explicit encoding so decoding does not depend on the locale;
        # assumes the dump files are UTF-8 - TODO confirm.
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                uttid, sep, transcript = line.rstrip().partition(" ")
                if sep:
                    transcripts[uttid] = transcript
        return transcripts

    @staticmethod
    def _read_index(path):
        """Parse ``<uttid> <value>`` lines of an index file into a dict.

        Fields are separated by the first run of whitespace. Blank lines are
        ignored and lines with an id but no value are skipped with a warning,
        instead of aborting the whole load with a ValueError.
        """
        index = {}
        # Same UTF-8 assumption as for the transcript file.
        with open(path, "r", encoding="utf-8") as handle:
            for line_no, line in enumerate(handle, start=1):
                fields = line.strip().split(maxsplit=1)
                if len(fields) == 2:
                    index[fields[0]] = fields[1]
                elif fields:
                    logger.warning("Skipping malformed line %d in %s", line_no, path)
        return index

    def _generate_examples(self):
        """Yield ``(key, example)`` for each transcript that has an index entry.

        Yields:
          Tuples of an integer key and a dict with the string fields
          ``audio``, ``text`` and ``id``.

        Raises:
          KeyError: if the config name has no entry in ``_FILEPATHS`` or
            ``_FEATURES_NAME``.
          OSError: if the transcript or index file cannot be opened.
        """
        config_name = self.config.name
        data_dir = _FILEPATHS[config_name]

        transcripts = self._read_transcripts(f"{data_dir}/text")
        index = self._read_index(f"{data_dir}/{_FEATURES_NAME[config_name]}")

        # The key is the utterance's position in the transcript file, so keys
        # stay unique (with gaps) when utterances are skipped.
        for key, (uttid, transcript) in enumerate(transcripts.items()):
            if uttid not in index:
                logger.warning("Missing utterance: %s", uttid)
                continue

            yield key, {"audio": index[uttid], "text": transcript, "id": uttid}