""" |
|
Filter and clean documents: |
|
Capable to clean docs with less than 512 characters, less than |
|
256 characters and contains javascript, fix text and dataset specific |
|
cleaning like stories and realnews datasets. |
|
Program arguments have the details. |
|
""" |

import argparse
import json
import multiprocessing
import os
import re
import time
from functools import partial
from pathlib import Path

import ftfy
from langdetect import detect


def process_doc(json_line, args):
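    # Parse one JSON document per input line and pull out its text field.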
    document = json.loads(json_line)
    text = document['text']
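
    # Track which task, if any, fired for this document.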
    output = {'remove_512': False, 'remove_256_javascript': False,
              'remove_512_non_english': False, 'ftfy_fix_text': False,
              'general_cleaning': False}

    try:
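        # Filter: drop documents shorter than 512 characters.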
        if "remove_512" in args.tasks:
            if len(text) < 512:
                output['remove_512'] = True
                return output, text, document, True
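
        # Filter: drop documents shorter than 256 characters that
        # mention javascript.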
        if "remove_256_javascript" in args.tasks:
            if len(text) < 256 and 'javascript' in text.lower():
                output['remove_256_javascript'] = True
                return output, text, document, True
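
        # Filter: drop short documents that langdetect classifies as
        # non-English.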
        if "remove_512_non_english" in args.tasks:
            if len(text) < 512 and detect(text) != 'en':
                output['remove_512_non_english'] = True
                return output, text, document, True
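
        # Clean: repair mojibake and other encoding damage with ftfy;
        # the document is kept, so the final flag is False.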
        if "ftfy_fix_text" in args.tasks:
            fixed_text = ftfy.fix_text(text)
            output['ftfy_fix_text'] = True
            return output, fixed_text, document, False
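
        # Clean: collapse whitespace artifacts; the document is kept.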
        if "general_cleaning" in args.tasks:
            # Collapse runs of two or more spaces, and newlines that follow
            # a word boundary, into a single space. (A single-space match
            # would only be replaced by itself, so it is excluded.)
            cleaned_text = re.sub(r"  +|\b\n+ |\b\n+", " ", text)
            # Dataset-specific rules (e.g. for the stories and realnews
            # datasets mentioned in the module docstring) can be added here
            # as further re.sub passes.
            output['general_cleaning'] = True
            return output, cleaned_text, document, False
    except Exception as e:
        print('Error: *************************\n{}\ntext: {}'.format(
            e, text), flush=True)
        return output, text, document, True
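
    # No requested task applied to this document; keep it unchanged.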
    return output, text, document, False


def process_set(args, input_file, output_f_cleaned, output_f_filtered):
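    # Stream one input file through a worker pool, splitting the results
    # into a 'cleaned' output (kept documents) and a 'filtered' output
    # (removed documents).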
    print(' > working on {} ...'.format(input_file), flush=True)
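
    # Per-task counters for the summary printed at the end.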
    num_docs = num_remove_512 = num_remove_java = 0
    num_remove_512_non_english = num_ftfy_fix_text = num_general_cleaning = 0
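
    # Open both output streams; documents are written as UTF-8 JSON lines.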
    output_cleaned = open(output_f_cleaned, 'wb')
    output_filtered = open(output_f_filtered, 'wb')

    start_time = time.time()
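
    # Fan the input lines out to a pool of workers; imap with a chunk size
    # of 500 returns results in input order. The worker count is hard-coded.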
    num_workers = 40
    fin = open(input_file, 'r', encoding='utf-8')
    pool = multiprocessing.Pool(num_workers)
    process_doc_partial = partial(process_doc, args=args)
    processed_docs = pool.imap(process_doc_partial, fin, 500)
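
    # Tally which task fired for each document and route it to the
    # appropriate output file.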
    for output, text, document, to_filter in processed_docs:
        num_docs += 1

        num_remove_512 += 1 if output['remove_512'] else 0
        num_remove_java += 1 if output['remove_256_javascript'] else 0
        num_remove_512_non_english += \
            1 if output['remove_512_non_english'] else 0
        num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
        num_general_cleaning += 1 if output['general_cleaning'] else 0
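
        # Re-serialize the (possibly cleaned) text and write it out.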
        document['text'] = text
        myjson = json.dumps(document, ensure_ascii=False)

        if to_filter:
            output_filtered.write(myjson.encode('utf-8'))
            output_filtered.write('\n'.encode('utf-8'))
        else:
            output_cleaned.write(myjson.encode('utf-8'))
            output_cleaned.write('\n'.encode('utf-8'))

        if num_docs % args.log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                num_docs, time.time() - start_time), flush=True)

    output_cleaned.close()
    output_filtered.close()
    fin.close()

    print(' >> total docs: {} remove_512 {} remove_256_javascript {} '
          'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'
          .format(num_docs, num_remove_512, num_remove_java,
                  num_remove_512_non_english, num_ftfy_fix_text,
                  num_general_cleaning), flush=True)


if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-files', nargs='*', required=True,
                        default=None,
                        help='Input json files that need to be cleaned')
    parser.add_argument('--tasks', nargs='*', required=True, default=None,
                        help='Tasks to perform on the input files, such as '
                             'remove_512, remove_256_javascript, '
                             'remove_512_non_english, ftfy_fix_text, and '
                             'general_cleaning. 256 and 512 refer to the '
                             'number of characters.')
    parser.add_argument('--output-path', type=str, default=None,
                        help='Directory where the output should go')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')

    args = parser.parse_args()

    print('cleanup dataset ...')
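
    # For each input file <name><ext>, write <name>_cleaned<ext> (kept
    # documents) and <name>_filtered<ext> (removed documents) under
    # --output-path.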
    for input_file in args.input_files:
        input_filename, input_filename_ext = os.path.splitext(
            Path(input_file).name)

        output_f_cleaned = os.path.join(
            args.output_path, input_filename + "_cleaned" +
            input_filename_ext)
        output_f_filtered = os.path.join(
            args.output_path, input_filename + "_filtered" +
            input_filename_ext)

        process_set(args, input_file, output_f_cleaned, output_f_filtered)

    print('done :-)', flush=True)