|
import os |
|
import json |
|
import datasets |
|
from pathlib import Path |
|
|
|
|
|
_DESCRIPTION = "Gigaword dataset" |
|
_DOCUMENT = "document" |
|
_ID = "id" |
|
|
|
|
|
class GigawordDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads Gigaword documents from local JSON Lines files.

    Every example has two string features: the document text and its id.
    The train and validation splits are read from ``train.jsonl`` and
    ``val.jsonl`` inside the data directory exposed by the download manager.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: description plus the two string features."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    _DOCUMENT: datasets.Value("string"),
                    _ID: datasets.Value("string"),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        """Return the SplitGenerators for the train and validation splits.

        Args:
            dl_manager: Download manager handed in by the ``datasets`` library;
                only its data directory is used, nothing is downloaded.

        Returns:
            A list with one ``SplitGenerator`` per split, each carrying the
            ``path`` and ``name`` keyword arguments for ``_generate_examples``.
        """
        # Use the public ``manual_dir`` property rather than reaching into the
        # private ``_data_dir`` attribute it wraps.
        data_dir = dl_manager.manual_dir
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "path": os.path.join(data_dir, "train.jsonl"),
                    "name": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "path": os.path.join(data_dir, "val.jsonl"),
                    "name": "validation",
                },
            ),
        ]

    def _generate_examples(self, path=None, name=None):
        """Yield ``(key, example)`` pairs read from a JSON Lines file.

        Args:
            path: Path of the UTF-8 encoded ``.jsonl`` file to read. Every
                non-blank line must be a JSON object with ``"id"`` and
                ``"text"`` keys.
            name: Split label supplied through ``gen_kwargs``; accepted for
                interface compatibility but not used here.

        Yields:
            Tuples of the record's id and a dict mapping the id and document
            feature names to the record's ``"id"`` and ``"text"`` values.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the ``"id"`` or ``"text"`` key.
        """
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing empty line at the end of the
                # file) carry no record and would make json.loads fail.
                if not line.strip():
                    continue
                record = json.loads(line)
                example_id = record["id"]
                yield example_id, {
                    _ID: example_id,
                    _DOCUMENT: record["text"],
                }
|
|