import json
import os
import re
from argparse import ArgumentParser
from multiprocessing import Pool

import tensorflow as tf
from sacremoses import MosesDetokenizer
from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT


"""
This script converts the P3 dataset used to train T0 from its tfrecord format to individual JSONL files.

Usage instructions:

NOTE: This script requires tensorflow to be installed.

1. Download the P3 dataset by cloning it from Hugging Face:
       git clone https://huggingface.co/datasets/bigscience/P3
   The raw data should then be at P3/data.
2. Run this script:
       python t0_dataset_preproc.py \
           --p3_dataset_path P3/data \
           --jsonl_output_path P3/data_processed_jsonl
3. The output is written to the jsonl_output_path directory with the following structure:
    - P3/data_processed_jsonl/train
        - super_glue_cb_does_this_imply.jsonl
        - super_glue_cb_justified_in_saying_score_eval.jsonl
        - .....
    - P3/data_processed_jsonl/val
        - super_glue_cb_does_this_imply.jsonl
        - super_glue_cb_justified_in_saying_score_eval.jsonl
        - .....
4. Each output JSONL file is compatible with NeMo's T0JSONLMemMapDataset
   (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/data/language_modeling/t0_dataset.py).
"""

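# For reference, each line of an output JSONL file is a JSON object with an "input" and an
# "output" field, plus an optional "choices" list when the task provides answer choices.
# An illustrative (made-up) record:
#   {"input": "Premise: ... Does this imply ...? Yes or no?", "output": "yes", "choices": ["yes", "no"]}

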
def _feature_config(shape, dtype):
    # tf.Example features only store int64, so widen 32-bit ints and bools for parsing;
    # they are cast back to their original dtypes after parsing.
    if dtype in ("int32", "bool"):
        dtype = "int64"
    # A leading dimension of None indicates a variable-length sequence feature.
    if shape and shape[0] is None:
        return tf.io.FixedLenSequenceFeature(shape[1:], dtype, allow_missing=True)
    return tf.io.FixedLenFeature(shape, dtype)


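# Note: _TASK_SPLITS_AND_FEATURES_DICT maps each task folder name to metadata including a
# "features_dict" of {feature_name: {"shape": ..., "dtype": ...}} entries, which is what
# _feature_config receives. An illustrative (made-up) entry:
#   {"inputs_pretokenized": {"shape": [], "dtype": "string"}, "is_correct": {"shape": [], "dtype": "bool"}}

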
def remove_newline_and_detokenize(x, detokenizer, remove_newlines):
    if remove_newlines:
        # Collapse literal "\n"/"\r" escape sequences as well as real newlines and
        # carriage returns into single spaces.
        x = re.sub(r'\\n+', ' ', x)
        x = re.sub(r'\n+', ' ', x)
        x = re.sub(r'\\r+', ' ', x)
        x = re.sub(r'\r+', ' ', x)
    x = x.strip()

    if remove_newlines:
        x = detokenizer.detokenize([x])
    return x


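# Illustrative effect with remove_newlines=True (assuming MosesDetokenizer('en') reattaches
# punctuation in the usual way):
#   remove_newline_and_detokenize('Premise :\nThe cat sat .', detokenizer, True)
#   -> 'Premise: The cat sat.'

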
def write_dataset_to_file(dataset, filename, detokenizer, remove_newlines):
    with open(filename, 'w') as f:
        for item in dataset:
            # Skip examples whose 'is_correct' flag is False (e.g. in *_score_eval tasks).
            if 'is_correct' in item and item['is_correct'].numpy().tolist() is False:
                print('Skipping example because is_correct is False')
                continue

            item_object = {}
            i = remove_newline_and_detokenize(
                item['inputs_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
            )
            item_object['input'] = i
            t = remove_newline_and_detokenize(
                item['targets_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
            )
            item_object['output'] = t
            if 'answer_choices' in item:
                choices = [
                    remove_newline_and_detokenize(x.decode('utf-8'), detokenizer, remove_newlines)
                    for x in item['answer_choices'].numpy().tolist()
                ]
                item_object['choices'] = choices
            f.write(json.dumps(item_object) + '\n')


def write_train_val_test_dataset_to_file(file_name, folder_name, output_folder, detokenizer, split, remove_newlines):
    ds = tf.data.TFRecordDataset(tf.io.gfile.glob([file_name]))
    # Build the tf.Example parsing spec for this task from its precomputed features dict.
    fdict = _TASK_SPLITS_AND_FEATURES_DICT[folder_name]['features_dict']
    feature_description = {feat: _feature_config(**desc) for feat, desc in fdict.items()}
    ds = ds.map(
        lambda pb: tf.io.parse_single_example(pb, feature_description),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    # Cast parsed features back to their original dtypes (parsing widens int32/bool to int64).
    ds = ds.map(
        lambda x: {k: tf.cast(v, fdict[k]["dtype"]) for k, v in x.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    write_dataset_to_file(ds, os.path.join(output_folder, split, folder_name + '.jsonl'), detokenizer, remove_newlines)


def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_newlines):
    if not os.path.isdir(os.path.join(data_folder, folder_name)):
        return
    print(f'Processing {folder_name}')
    train_fname = os.path.join(data_folder, folder_name, 'train.tfrecord-00000-of-00001')
    valid_fname = os.path.join(data_folder, folder_name, 'validation.tfrecord-00000-of-00001')
    test_fname = os.path.join(data_folder, folder_name, 'test.tfrecord-00000-of-00001')
    if not os.path.exists(train_fname):
        print(f'Could not find {train_fname}')
        return
    write_train_val_test_dataset_to_file(
        train_fname, folder_name, output_folder, detokenizer, 'train', remove_newlines
    )
    if os.path.exists(valid_fname):
        write_train_val_test_dataset_to_file(
            valid_fname, folder_name, output_folder, detokenizer, 'val', remove_newlines
        )
    if os.path.exists(test_fname):
        write_train_val_test_dataset_to_file(
            test_fname, folder_name, output_folder, detokenizer, 'test', remove_newlines
        )


def process_all_folders(data_folder, output_folder, remove_newlines):
    detokenizer = MosesDetokenizer('en')
    assert os.path.isdir(data_folder)
    # Create the output folder and one subfolder per split.
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(os.path.join(output_folder, 'train'), exist_ok=True)
    os.makedirs(os.path.join(output_folder, 'val'), exist_ok=True)
    os.makedirs(os.path.join(output_folder, 'test'), exist_ok=True)

    print(f'Found {len(os.listdir(data_folder))} folders to process ...')
    pool_args = []
    for folder_name in os.listdir(data_folder):
        pool_args.append((data_folder, folder_name, output_folder, detokenizer, remove_newlines))
    # Convert the task folders in parallel, one task folder per worker invocation.
    with Pool() as pool:
        pool.starmap(process_folder, pool_args)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        "--p3_dataset_path",
        type=str,
        required=True,
        help="Path to the raw P3 data. Should be a folder containing one folder per task. After cloning the repo this corresponds to P3/data.",
    )
    parser.add_argument(
        "--jsonl_output_path",
        type=str,
        required=True,
        help="Path to the output folder where the JSONL files will be written.",
    )
    parser.add_argument(
        "--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.",
    )
    args = parser.parse_args()
    process_all_folders(args.p3_dataset_path, args.jsonl_output_path, args.remove_newlines)