# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
from argparse import ArgumentParser
from multiprocessing import Pool
import tensorflow as tf
from sacremoses import MosesDetokenizer
from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT
"""
This script converts the P3 dataset used to train T0 from a tfrecords format to individual JSONL files.
Use instructions:
NOTE: This script requires tensorflow to be installed.
1. Download the P3 dataset by cloning it from Huggingface:
git clone https://huggingface.co/datasets/bigscience/P3. The raw data should be at P3/data.
2. Run this script:
python t0_dataset_preproc.py \
--p3_dataset_path P3/data \
--jsonl_output_path P3/data_processed_jsonl
3. The output will be in the jsonl_output_path directory. In the following structure:
- P3/data_processed_jsonl/train
- super_glue_cb_does_this_imply.jsonl
- super_glue_cb_justified_in_saying_score_eval.jsonl
- .....
- P3/data_processed_jsonl/val
- super_glue_cb_does_this_imply.jsonl
- super_glue_cb_justified_in_saying_score_eval.jsonl
- .....
4. Each JSONL file is compatible with NeMo's T0JSONLMemMapDataset (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/data/language_modeling/t0_dataset.py)
"""


def _feature_config(shape, dtype):
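    """Return a tf.io feature spec for a single feature, given its shape and dtype."""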
if dtype in ("int32", "bool"):
# int32 and bool are stored as int64 in the tf.train.Example protobuf.
dtype = "int64"
if shape and shape[0] is None:
return tf.io.FixedLenSequenceFeature(shape[1:], dtype, allow_missing=True)
return tf.io.FixedLenFeature(shape, dtype)


def remove_newline_and_detokenize(x, detokenizer, remove_newlines):
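    """Optionally collapse escaped and literal newlines/carriage returns into spaces and apply Moses detokenization."""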
if remove_newlines:
x = re.sub(r'\\n+', ' ', x)
x = re.sub(r'\n+', ' ', x)
x = re.sub(r'\\r+', ' ', x)
x = re.sub(r'\r+', ' ', x)
x = x.strip()
    # NOTE: The detokenizer is only applied when remove_newlines is set, since sacremoses' detokenize
    # appears to strip \n characters as well.
if remove_newlines:
x = detokenizer.detokenize([x])
return x


def write_dataset_to_file(dataset, filename, detokenizer, remove_newlines):
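    """Write a parsed dataset to a JSONL file with 'input', 'output' and optional 'choices' fields,
    skipping examples whose 'is_correct' flag is False."""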
with open(filename, 'w') as f:
for item in dataset:
            # NOTE: `.tolist()` on the scalar numpy value returns a plain Python object (not a list),
            # which lets us compare against False directly.
if 'is_correct' in item and item['is_correct'].numpy().tolist() is False:
print('Skipping example because is_correct is False')
continue
item_object = {}
i = remove_newline_and_detokenize(
item['inputs_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
)
item_object['input'] = i
t = remove_newline_and_detokenize(
item['targets_pretokenized'].numpy().decode('utf-8'), detokenizer, remove_newlines
)
item_object['output'] = t
if 'answer_choices' in item:
choices = [
remove_newline_and_detokenize(x.decode('utf-8'), detokenizer, remove_newlines)
for x in item['answer_choices'].numpy().tolist()
]
item_object['choices'] = choices
f.write(json.dumps(item_object) + '\n')


def write_train_val_test_dataset_to_file(file_name, folder_name, output_folder, detokenizer, split, remove_newlines):
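    """Parse the TFRecord shard of a single task and write it to <output_folder>/<split>/<folder_name>.jsonl."""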
ds = tf.data.TFRecordDataset(tf.io.gfile.glob([file_name]))
fdict = _TASK_SPLITS_AND_FEATURES_DICT[folder_name]['features_dict']
feature_description = {feat: _feature_config(**desc) for feat, desc in fdict.items()}
ds = ds.map(
lambda pb: tf.io.parse_single_example(pb, feature_description),
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
ds = ds.map(
lambda x: {k: tf.cast(v, fdict[k]["dtype"]) for k, v in x.items()},
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
write_dataset_to_file(ds, os.path.join(output_folder, split, folder_name + '.jsonl'), detokenizer, remove_newlines)


def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_newlines):
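    """Convert the train/validation/test TFRecord shards of one task folder into JSONL files."""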
if not os.path.isdir(os.path.join(data_folder, folder_name)):
return
print(f'Processing {folder_name}')
train_fname = os.path.join(data_folder, folder_name, 'train.tfrecord-00000-of-00001')
valid_fname = os.path.join(data_folder, folder_name, 'validation.tfrecord-00000-of-00001')
test_fname = os.path.join(data_folder, folder_name, 'test.tfrecord-00000-of-00001')
if not os.path.exists(train_fname):
print(f'Could not find {train_fname}')
return
write_train_val_test_dataset_to_file(
train_fname, folder_name, output_folder, detokenizer, 'train', remove_newlines
)
if os.path.exists(valid_fname):
write_train_val_test_dataset_to_file(
valid_fname, folder_name, output_folder, detokenizer, 'val', remove_newlines
)
if os.path.exists(test_fname):
write_train_val_test_dataset_to_file(
test_fname, folder_name, output_folder, detokenizer, 'test', remove_newlines
)


def process_all_folders(data_folder, output_folder, remove_newlines):
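    """Convert every task folder under data_folder into JSONL files, using a multiprocessing pool."""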
detokenizer = MosesDetokenizer('en')
assert os.path.isdir(data_folder)
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(output_folder, split), exist_ok=True)
print(f'Found {len(os.listdir(data_folder))} folders to process ...')
pool_args = []
for folder_name in os.listdir(data_folder):
pool_args.append((data_folder, folder_name, output_folder, detokenizer, remove_newlines))
    with Pool() as pool:
        pool.starmap(process_folder, pool_args)


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument(
"--p3_dataset_path",
type=str,
required=True,
help="Path to raw P3 data. Should be a folder containing folders for each task. After cloning the repo this should correspond to P3/data",
)
parser.add_argument(
"--jsonl_output_path",
type=str,
required=True,
help="Path to output folder where JSONL files will be written.",
)
parser.add_argument(
"--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.",
)
args = parser.parse_args()
process_all_folders(args.p3_dataset_path, args.jsonl_output_path, args.remove_newlines)