|
import numpy as np |
|
import kaldiio |
|
|
|
import datasets |
|
from transformers.utils import logging |
|
|
|
|
|
# Module-level logger obtained from the `logging` helper imported at the top of
# this file; used to report utterances that cannot be matched to an index entry.
logger = logging.get_logger(__name__)
|
|
|
_DESCRIPTION = "Annotated Subtitles" |
|
|
|
_FILEPATHS = { |
|
"fbank_pitch": "/esat/spchtemp/scratch/jponcele/espnet2/dump/fbank_pitch/subs_annot", |
|
"raw": "/esat/spchtemp/scratch/jponcele/espnet2/dump/raw/subs_annot" |
|
} |
|
|
|
_FEATURES_NAME = { |
|
"fbank_pitch": "feats.scp", |
|
"raw": "wav.scp" |
|
} |
|
|
|
|
|
class CGNConfig(datasets.BuilderConfig):
    """Builder configuration for this dataset script, pinned to version 2.6.1."""

    def __init__(self, **kwargs):
        """Create a configuration with a fixed dataset version.

        Args:
            **kwargs: keyword arguments forwarded unchanged to
                `datasets.BuilderConfig` (this file passes `name` and
                `description`). `version` is set here and must not be
                supplied by the caller.
        """
        super().__init__(version=datasets.Version("2.6.1", ""), **kwargs)
|
|
|
|
|
class CGN(datasets.GeneratorBasedBuilder):
    """Builder that pairs transcripts with their audio/feature index entries.

    For the active config, the data directory is looked up in `_FILEPATHS`.
    From it the builder reads a `text` file (utterance id followed by the
    transcript) and the index file named in `_FEATURES_NAME` (utterance id
    followed by the entry). A single split named "test" is produced.
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "raw"
    BUILDER_CONFIGS = [
        CGNConfig(name="raw", description="All Components"),
    ]

    def _info(self):
        """Describe the dataset: three string columns (`audio`, `text`, `id`)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    # The index entry is stored verbatim as a string; it is
                    # not decoded into audio samples by this script.
                    "audio": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            # NOTE(review): `datasets` documents supervised_keys as an
            # (input, output) pair; a one-element tuple is kept here as in the
            # original -- confirm whether ("audio", "text") was intended.
            supervised_keys=("text",),
        )

    def _split_generators(self, _):
        """Return the only split, "test"; the download manager argument is unused."""
        return [
            datasets.SplitGenerator(
                name="test",
                gen_kwargs={},
            )
        ]

    @staticmethod
    def _read_transcripts(path):
        """Return a dict mapping utterance id -> transcript read from `path`.

        Each line is split at its first space after trailing whitespace is
        removed. Lines without a space (an id with no transcript, or a blank
        line) are skipped. A repeated id keeps its last transcript.
        """
        transcripts = {}
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                uttid, separator, transcript = line.rstrip().partition(" ")
                if separator:
                    transcripts[uttid] = transcript
        return transcripts

    @staticmethod
    def _read_index(path):
        """Return a dict mapping utterance id -> index entry read from `path`.

        Each line is split once on whitespace into id and entry. Blank lines
        are ignored; lines with an id but no entry are skipped with a warning
        instead of aborting the whole load.
        """
        entries = {}
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                fields = line.strip().split(maxsplit=1)
                if len(fields) == 2:
                    entries[fields[0]] = fields[1]
                elif fields:
                    logger.warning(f"Skipping malformed index line: {line.strip()}")
        return entries

    def _generate_examples(self):
        """Yield `(key, example)` for every transcript that has an index entry.

        Yields:
            tuple: `key` is the utterance's position among the transcripts
            (positions of skipped utterances are not reused) and `example` is
            a dict with the fields "audio", "text" and "id".
        """
        data_dir = _FILEPATHS[self.config.name]
        transcripts = self._read_transcripts(f"{data_dir}/text")
        entries = self._read_index(f"{data_dir}/{_FEATURES_NAME[self.config.name]}")

        for key, (uttid, transcript) in enumerate(transcripts.items()):
            if uttid not in entries:
                # Transcript without a matching index entry: report and move on.
                logger.warning(f"Missing utterance: {uttid}")
                continue

            yield key, {"audio": entries[uttid], "text": transcript, "id": uttid}
|
|