import json

import datasets
from datasets import DatasetBuilder, DatasetInfo, DownloadManager, SplitGenerator
from datasets.features import ClassLabel, Features, Sequence, Value


class MyCustomDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that loads the Hi-ToM records from a single JSON file.

    The builder declares one ``train`` split. Every example carries the nine
    fields listed in :meth:`_info`, copied unchanged from the source records.

    ``GeneratorBasedBuilder`` (a subclass of ``DatasetBuilder``) is used as the
    base class because it is the one that drives ``_split_generators`` and
    ``_generate_examples``; the plain ``DatasetBuilder`` leaves split
    preparation unimplemented.
    """

    VERSION = datasets.Version("1.0.0")

    # Raw-content URL: the GitHub ``/blob/`` page serves HTML, not the JSON
    # payload, so it cannot be parsed by ``json.load``.
    # NOTE(review): the path inside the repository is kept from the original
    # URL -- confirm the file really lives at the repository root.
    _DATA_URL = (
        "https://raw.githubusercontent.com/ying-hui-he/Hi-ToM_dataset/"
        "main/Hi-ToM_data.json"
    )

    def _info(self):
        """Return the dataset metadata and the per-example feature schema."""
        return DatasetInfo(
            description="My custom dataset for tracking objects.",
            features=Features(
                {
                    "prompting_type": Value("string"),
                    "deception": Value("bool"),
                    "story_length": Value("int32"),
                    "question_order": Value("int32"),
                    "sample_id": Value("int32"),
                    "story": Value("string"),
                    "question": Value("string"),
                    "choices": Value("string"),
                    "answer": Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/ying-hui-he/Hi-ToM_dataset",
            # NOTE(review): CITATION is not defined in the visible part of
            # this file; it must exist at module level or this call raises
            # NameError.
            citation=CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager):
        """Download the data file and declare the single ``train`` split.

        Args:
            dl_manager: Download manager used to fetch the JSON file.

        Returns:
            A one-element list with the ``train`` split generator, whose
            ``filepath`` keyword argument is the local path of the download.
        """
        downloaded_files = dl_manager.download_and_extract(
            {"data_file": self._DATA_URL}
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": downloaded_files["data_file"],
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from the downloaded JSON file.

        The file is expected to be a JSON object whose ``"data"`` entry is a
        list of records. Field values are passed through unchanged, so they
        must already match the types declared in :meth:`_info`.

        Args:
            filepath: Local path of the JSON file to read.

        Yields:
            Tuples of the record's position in the list and a dict with the
            nine declared fields.

        Raises:
            KeyError: If the top-level ``"data"`` key or any declared field is
                missing from a record.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        # ``idx`` rather than ``id`` so the builtin is not shadowed.
        for idx, item in enumerate(data["data"]):
            yield idx, {
                "prompting_type": item["prompting_type"],
                "deception": item["deception"],
                "story_length": item["story_length"],
                "question_order": item["question_order"],
                "sample_id": item["sample_id"],
                "story": item["story"],
                "question": item["question"],
                "choices": item["choices"],
                "answer": item["answer"],
            }