# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
from argparse import ArgumentParser
from multiprocessing import Pool
import tensorflow as tf
from sacremoses import MosesDetokenizer
from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT
"""
This script converts the P3 dataset used to train T0 from a tfrecords format to individual JSONL files.
Use instructions:
NOTE: This script requires tensorflow to be installed.
1. Download the P3 dataset by cloning it from Huggingface:
git clone https://huggingface.co/datasets/bigscience/P3. The raw data should be at P3/data.
2. Run this script:
python t0_dataset_preproc.py \
--p3_dataset_path P3/data \
--jsonl_output_path P3/data_processed_jsonl
3. The output will be in the jsonl_output_path directory. In the following structure:
- P3/data_processed_jsonl/train
- super_glue_cb_does_this_imply.jsonl
- super_glue_cb_justified_in_saying_score_eval.jsonl
- .....
- P3/data_processed_jsonl/val
- super_glue_cb_does_this_imply.jsonl
- super_glue_cb_justified_in_saying_score_eval.jsonl
- .....
4. Each JSONL file is compatible with NeMo's T0JSONLMemMapDataset (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/data/language_modeling/t0_dataset.py)
"""


def _feature_config(shape, dtype):
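    """Return a tf.io feature spec for a single feature, given its shape and dtype."""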
if dtype in ("int32", "bool"):
# int32 and bool are stored as int64 in the tf.train.Example protobuf.
dtype = "int64"
if shape and shape[0] is None:
return tf.io.FixedLenSequenceFeature(shape[1:], dtype, allow_missing=True)
return tf.io.FixedLenFeature(shape, dtype)


def remove_newline_and_detokenize(x, detokenizer, remove_newlines):
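    """Optionally collapse escaped and literal newlines/carriage returns into spaces and apply Moses detokenization."""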
if remove_newlines:
x = re.sub(r'\\n+', ' ', x)
x = re.sub(r'\n+', ' ', x)
x = re.sub(r'\\r+', ' ', x)
x = re.sub(r'\r+', ' ', x)
x = x.strip()
    # NOTE: The detokenizer is only applied when remove_newlines is set, since sacremoses' detokenize
    # appears to strip \n characters as well.
if remove_newlines:
x = detokenizer.detokenize([x])
return x


def write_dataset_to_file(dataset, filename, detokenizer, remove_newlines):
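    """Write a parsed dataset to a JSONL file with 'input', 'output' and optional 'choices' fields,
    skipping examples whose 'is_correct' flag is False."""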
with open(filename, 'w') as f:
for item in dataset:
            # NOTE: `.tolist()` on the scalar numpy value returns a plain Python object (not a list),
            # which lets us compare against False directly.
if 'is_correct' in item and item['is_correct'].numpy().tolist() is False:
print('Skipping example because is_correct is False')
continue
item_object = {}
i = remove_newline_and_detokenize(
item['inputs_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
)
item_object['input'] = i
t = remove_newline_and_detokenize(
item['targets_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
)
item_object['output'] = t
if 'answer_choices' in item:
choices = [
remove_newline_and_detokenize(x.decode('utf-8'), detokenizer, remove_newlines)
for x in item['answer_choices'].numpy().tolist()
]
item_object['choices'] = choices
f.write(json.dumps(item_object) + '\n')


def write_train_val_test_dataset_to_file(file_name, folder_name, output_folder, detokenizer, split, remove_newlines):
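    """Parse the TFRecord shard of a single task and write it to <output_folder>/<split>/<folder_name>.jsonl."""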
ds = tf.data.TFRecordDataset(tf.io.gfile.glob([file_name]))
fdict = _TASK_SPLITS_AND_FEATURES_DICT[folder_name]['features_dict']
feature_description = {feat: _feature_config(**desc) for feat, desc in fdict.items()}
ds = ds.map(
lambda pb: tf.io.parse_single_example(pb, feature_description),
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
ds = ds.map(
lambda x: {k: tf.cast(v, fdict[k]["dtype"]) for k, v in x.items()},
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
write_dataset_to_file(ds, os.path.join(output_folder, split, folder_name + '.jsonl'), detokenizer, remove_newlines)


def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_newlines):
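    """Convert the train/validation/test TFRecord shards of one task folder into JSONL files."""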
if not os.path.isdir(os.path.join(data_folder, folder_name)):
return
print(f'Processing {folder_name}')
train_fname = os.path.join(data_folder, folder_name, 'train.tfrecord-00000-of-00001')
valid_fname = os.path.join(data_folder, folder_name, 'validation.tfrecord-00000-of-00001')
test_fname = os.path.join(data_folder, folder_name, 'test.tfrecord-00000-of-00001')
if not os.path.exists(train_fname):
print(f'Could not find {train_fname}')
return
write_train_val_test_dataset_to_file(
train_fname, folder_name, output_folder, detokenizer, 'train', remove_newlines
)
if os.path.exists(valid_fname):
write_train_val_test_dataset_to_file(
valid_fname, folder_name, output_folder, detokenizer, 'val', remove_newlines
)
if os.path.exists(test_fname):
write_train_val_test_dataset_to_file(
test_fname, folder_name, output_folder, detokenizer, 'test', remove_newlines
)


def process_all_folders(data_folder, output_folder, remove_newlines):
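    """Convert every task folder under data_folder into JSONL files, using a multiprocessing pool."""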
detokenizer = MosesDetokenizer('en')
assert os.path.isdir(data_folder)
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(output_folder, split), exist_ok=True)
print(f'Found {len(os.listdir(data_folder))} folders to process ...')
pool_args = []
for folder_name in os.listdir(data_folder):
pool_args.append((data_folder, folder_name, output_folder, detokenizer, remove_newlines))
    with Pool() as pool:
        pool.starmap(process_folder, pool_args)


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument(
"--p3_dataset_path",
type=str,
required=True,
help="Path to raw P3 data. Should be a folder containing folders for each task. After cloning the repo this should correspond to P3/data",
)
parser.add_argument(
"--jsonl_output_path",
type=str,
required=True,
help="Path to output folder where JSONL files will be written.",
)
parser.add_argument(
"--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.",
)
args = parser.parse_args()
process_all_folders(args.p3_dataset_path, args.jsonl_output_path, args.remove_newlines)