File size: 7,512 Bytes

2d8da09

# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re
from argparse import ArgumentParser
from multiprocessing import Pool

import tensorflow as tf
from sacremoses import MosesDetokenizer
from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT


"""
This script converts the P3 dataset used to train T0 from a tfrecords format to individual JSONL files.

Use instructions:

NOTE: This script requires tensorflow to be installed.

1. Download the P3 dataset by cloning it from Huggingface:
        git clone https://huggingface.co/datasets/bigscience/P3
   The raw data should then be at P3/data.
2. Run this script: 
    python t0_dataset_preproc.py \
        --p3_dataset_path P3/data \
        --jsonl_output_path P3/data_processed_jsonl
3. The output will be written to the jsonl_output_path directory, in the following structure:
    - P3/data_processed_jsonl/train
       - super_glue_cb_does_this_imply.jsonl
       - super_glue_cb_justified_in_saying_score_eval.jsonl
       - .....
    - P3/data_processed_jsonl/val
       - super_glue_cb_does_this_imply.jsonl
       - super_glue_cb_justified_in_saying_score_eval.jsonl
       - .....
4. Each JSONL file is compatible with NeMo's T0JSONLMemMapDataset (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/data/language_modeling/t0_dataset.py)
"""


def _feature_config(shape, dtype):
    """Build the ``tf.io`` parsing spec for a single feature.

    Args:
        shape: Feature shape. A leading ``None`` selects a sequence feature
            whose per-step shape is ``shape[1:]``.
        dtype: Feature dtype name.

    Returns:
        A ``tf.io.FixedLenSequenceFeature`` (with ``allow_missing=True``) when
        the first dimension is ``None``, otherwise a ``tf.io.FixedLenFeature``.
    """
    # int32 and bool are stored as int64 in the tf.train.Example protobuf.
    storage_dtype = "int64" if dtype in ("int32", "bool") else dtype
    has_open_leading_dim = bool(shape) and shape[0] is None
    if not has_open_leading_dim:
        return tf.io.FixedLenFeature(shape, storage_dtype)
    return tf.io.FixedLenSequenceFeature(shape[1:], storage_dtype, allow_missing=True)


def remove_newline_and_detokenize(x, detokenizer, remove_newlines):
    """Strip ``x`` and, when requested, remove newlines and detokenize it.

    Args:
        x: Text to clean.
        detokenizer: Object exposing ``detokenize(list_of_tokens)``. Only used
            when ``remove_newlines`` is true, so it may be ``None`` otherwise.
        remove_newlines: When true, every run of real or escaped
            (backslash followed by the letter) newline / carriage-return
            sequences is replaced by one space and the stripped text is passed
            through ``detokenizer.detokenize``.

    Returns:
        The stripped text when ``remove_newlines`` is false; otherwise whatever
        ``detokenizer.detokenize([text])`` returns.
    """
    if remove_newlines:
        # The escaped forms must be grouped before applying `+`: the pattern
        # r'\\n+' means "a backslash followed by one or more 'n'", which would
        # also swallow the first letters of a following word such as "\nnice".
        x = re.sub(r'(?:\\n)+', ' ', x)
        x = re.sub(r'\n+', ' ', x)
        x = re.sub(r'(?:\\r)+', ' ', x)
        x = re.sub(r'\r+', ' ', x)
    x = x.strip()
    # NOTE: Detokenization is tied to newline removal since sacremoses detokenize seems to remove \n as well.
    if remove_newlines:
        x = detokenizer.detokenize([x])
    return x


def write_dataset_to_file(dataset, filename, detokenizer, remove_newlines):
    """Write every example of ``dataset`` to ``filename`` as one JSON object per line.

    Each written object has an ``input`` and an ``output`` key, plus ``choices``
    when the example carries ``answer_choices``. Examples whose ``is_correct``
    field is ``False`` are skipped.

    Args:
        dataset: Iterable of mappings whose values expose ``.numpy()``.
        filename: Path of the JSONL file to create (overwritten if present).
        detokenizer: Forwarded to ``remove_newline_and_detokenize``.
        remove_newlines: Forwarded to ``remove_newline_and_detokenize``.
    """

    def _clean(raw_bytes):
        # Decode one byte string and apply the shared text clean-up.
        return remove_newline_and_detokenize(raw_bytes.decode('utf-8'), detokenizer, remove_newlines)

    with open(filename, 'w') as out_file:
        for example in dataset:
            if 'is_correct' in example:
                # NOTE: `.tolist()` is only used to turn the numpy scalar into a
                # plain Python object so it can be compared against False by identity.
                marked_wrong = example['is_correct'].numpy().tolist() is False
                if marked_wrong:
                    print('Skipping example because is_correct is False')
                    continue

            record = {
                'input': _clean(example['inputs_pretokenized'].numpy()),
                'output': _clean(example['targets_pretokenized'].numpy()),
            }
            if 'answer_choices' in example:
                raw_choices = example['answer_choices'].numpy().tolist()
                record['choices'] = [_clean(choice) for choice in raw_choices]
            out_file.write(json.dumps(record) + '\n')


def write_train_val_test_dataset_to_file(file_name, folder_name, output_folder, detokenizer, split, remove_newlines):
    """Convert one tfrecord file into ``<output_folder>/<split>/<folder_name>.jsonl``.

    Args:
        file_name: Path (or glob pattern) of the tfrecord file(s) to read.
        folder_name: Task name; used as the key into
            ``_TASK_SPLITS_AND_FEATURES_DICT`` and as the output file stem.
        output_folder: Root folder of the JSONL output.
        detokenizer: Forwarded to ``write_dataset_to_file``.
        split: Sub-folder of ``output_folder`` to write into.
        remove_newlines: Forwarded to ``write_dataset_to_file``.
    """
    records = tf.data.TFRecordDataset(tf.io.gfile.glob([file_name]))
    feature_specs = _TASK_SPLITS_AND_FEATURES_DICT[folder_name]['features_dict']
    parse_spec = {name: _feature_config(**spec) for name, spec in feature_specs.items()}
    autotune = tf.data.experimental.AUTOTUNE

    def _parse(serialized):
        # Decode one serialized tf.train.Example using the task's feature spec.
        return tf.io.parse_single_example(serialized, parse_spec)

    def _restore_dtypes(example):
        # Cast each parsed feature back to the dtype declared for the task.
        return {name: tf.cast(value, feature_specs[name]["dtype"]) for name, value in example.items()}

    records = records.map(_parse, num_parallel_calls=autotune)
    records = records.map(_restore_dtypes, num_parallel_calls=autotune)

    destination = os.path.join(output_folder, split, folder_name + '.jsonl')
    write_dataset_to_file(records, destination, detokenizer, remove_newlines)


def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_newlines):
    """Convert the train / validation / test tfrecords of one task folder.

    Nothing happens when ``folder_name`` is not a directory inside
    ``data_folder`` or when the task has no train file. Validation and test
    files are converted only when they exist.

    Args:
        data_folder: Folder containing one sub-folder per task.
        folder_name: Name of the task sub-folder to process.
        output_folder: Root folder of the JSONL output.
        detokenizer: Forwarded to ``write_train_val_test_dataset_to_file``.
        remove_newlines: Forwarded to ``write_train_val_test_dataset_to_file``.
    """
    task_dir = os.path.join(data_folder, folder_name)
    if not os.path.isdir(task_dir):
        return
    print(f'Processing {folder_name}')

    train_fname = os.path.join(task_dir, 'train.tfrecord-00000-of-00001')
    if not os.path.exists(train_fname):
        print(f'Could not find {train_fname}')
        return

    def _convert(shard_path, split):
        # Shared call for every split of this task.
        write_train_val_test_dataset_to_file(
            shard_path, folder_name, output_folder, detokenizer, split, remove_newlines
        )

    _convert(train_fname, 'train')

    # The remaining splits are optional; the output split for validation data is named 'val'.
    optional_shards = (
        ('val', os.path.join(task_dir, 'validation.tfrecord-00000-of-00001')),
        ('test', os.path.join(task_dir, 'test.tfrecord-00000-of-00001')),
    )
    for split, shard_path in optional_shards:
        if os.path.exists(shard_path):
            _convert(shard_path, split)


def process_all_folders(data_folder, output_folder, remove_newlines):
    """Convert every task folder under ``data_folder`` to JSONL, in parallel.

    Creates ``train``, ``val`` and ``test`` sub-folders under ``output_folder``
    (including ``output_folder`` itself if needed) and then runs
    ``process_folder`` for each entry of ``data_folder`` in a process pool.

    Args:
        data_folder: Folder containing one sub-folder per task.
        output_folder: Root folder where the JSONL files are written.
        remove_newlines: Forwarded to ``process_folder``.

    Raises:
        AssertionError: If ``data_folder`` is not an existing directory.
        OSError: If an output directory cannot be created.
    """
    detokenizer = MosesDetokenizer('en')
    assert os.path.isdir(data_folder), f'{data_folder} is not a directory'

    # os.makedirs creates missing parents, so output_folder itself is covered.
    # Unlike shelling out to `mkdir -p`, it is safe for paths containing spaces
    # or shell metacharacters and reports failures as exceptions.
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(output_folder, split), exist_ok=True)

    folder_names = os.listdir(data_folder)
    print(f'Found {len(folder_names)} folders to process ...')
    pool_args = [
        (data_folder, folder_name, output_folder, detokenizer, remove_newlines) for folder_name in folder_names
    ]
    # starmap blocks until every task is done, so the workers can be released
    # as soon as the `with` block exits.
    with Pool() as pool:
        pool.starmap(process_folder, pool_args)


if __name__ == '__main__':
    # Command-line entry point: parse the input/output locations and the
    # newline-removal switch, then convert every task folder.
    cli = ArgumentParser()
    required_path_options = (
        (
            "--p3_dataset_path",
            "Path to raw P3 data. Should be a folder containing folders for each task. After cloning the repo this should correspond to P3/data",
        ),
        ("--jsonl_output_path", "Path to output folder where JSONL files will be written."),
    )
    for flag, description in required_path_options:
        cli.add_argument(flag, type=str, required=True, help=description)
    cli.add_argument(
        "--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.",
    )
    options = cli.parse_args()
    process_all_folders(options.p3_dataset_path, options.jsonl_output_path, options.remove_newlines)