""" |
|
Filter and clean documents: |
|
Capable to clean docs with less than 512 characters, less than |
|
256 characters and contains javascript, fix text and dataset specific |
|
cleaning like stories and realnews datasets. |
|
Program arguments have the details. |
|
""" |

import argparse
import json
import multiprocessing
import os
import re
import time
from functools import partial
from pathlib import Path

import ftfy
from langdetect import detect


def process_doc(json_line, args):
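    # Parse one JSON document per input line and pull out its text field.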
    document = json.loads(json_line)
    text = document['text']
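
    # Track which task, if any, fired for this document.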
    output = {'remove_512': False, 'remove_256_javascript': False,
              'remove_512_non_english': False, 'ftfy_fix_text': False,
              'general_cleaning': False}

    try:
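        # Filter: drop documents shorter than 512 characters.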
        if "remove_512" in args.tasks:
            if len(text) < 512:
                output['remove_512'] = True
                return output, text, document, True
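
        # Filter: drop documents shorter than 256 characters that
        # mention javascript.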
        if "remove_256_javascript" in args.tasks:
            if len(text) < 256 and 'javascript' in text.lower():
                output['remove_256_javascript'] = True
                return output, text, document, True
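
        # Filter: drop short documents that langdetect classifies as
        # non-English.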
        if "remove_512_non_english" in args.tasks:
            if len(text) < 512 and detect(text) != 'en':
                output['remove_512_non_english'] = True
                return output, text, document, True
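
        # Clean: repair mojibake and other encoding damage with ftfy;
        # the document is kept, so the final flag is False.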
        if "ftfy_fix_text" in args.tasks:
            fixed_text = ftfy.fix_text(text)
            output['ftfy_fix_text'] = True
            return output, fixed_text, document, False
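
        # Clean: collapse whitespace artifacts; the document is kept.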
        if "general_cleaning" in args.tasks:
            # Collapse runs of two or more spaces, and newlines that follow
            # a word boundary, into a single space. (A single-space match
            # would only be replaced by itself, so it is excluded.)
            cleaned_text = re.sub(r"  +|\b\n+ |\b\n+", " ", text)
            # Dataset-specific rules (e.g. for the stories and realnews
            # datasets mentioned in the module docstring) can be added here
            # as further re.sub passes.
            output['general_cleaning'] = True
            return output, cleaned_text, document, False
    except Exception as e:
        print('Error: *************************\n{}\ntext: {}'.format(
            e, text), flush=True)
        return output, text, document, True
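
    # No requested task applied to this document; keep it unchanged.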
    return output, text, document, False


def process_set(args, input_file, output_f_cleaned, output_f_filtered):
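    # Stream one input file through a worker pool, splitting the results
    # into a 'cleaned' output (kept documents) and a 'filtered' output
    # (removed documents).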
    print(' > working on {} ...'.format(input_file), flush=True)
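
    # Per-task counters for the summary printed at the end.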
    num_docs = num_remove_512 = num_remove_java = 0
    num_remove_512_non_english = num_ftfy_fix_text = num_general_cleaning = 0
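
    # Open both output streams; documents are written as UTF-8 JSON lines.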
    output_cleaned = open(output_f_cleaned, 'wb')
    output_filtered = open(output_f_filtered, 'wb')

    start_time = time.time()
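
    # Fan the input lines out to a pool of workers; imap with a chunk size
    # of 500 returns results in input order. The worker count is hard-coded.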
    num_workers = 40
    fin = open(input_file, 'r', encoding='utf-8')
    pool = multiprocessing.Pool(num_workers)
    process_doc_partial = partial(process_doc, args=args)
    processed_docs = pool.imap(process_doc_partial, fin, 500)
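
    # Tally which task fired for each document and route it to the
    # appropriate output file.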
    for output, text, document, to_filter in processed_docs:
        num_docs += 1

        num_remove_512 += 1 if output['remove_512'] else 0
        num_remove_java += 1 if output['remove_256_javascript'] else 0
        num_remove_512_non_english += \
            1 if output['remove_512_non_english'] else 0
        num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
        num_general_cleaning += 1 if output['general_cleaning'] else 0
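
        # Re-serialize the (possibly cleaned) text and write it out.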
        document['text'] = text
        myjson = json.dumps(document, ensure_ascii=False)

        if to_filter:
            output_filtered.write(myjson.encode('utf-8'))
            output_filtered.write('\n'.encode('utf-8'))
        else:
            output_cleaned.write(myjson.encode('utf-8'))
            output_cleaned.write('\n'.encode('utf-8'))

        if num_docs % args.log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                num_docs, time.time() - start_time), flush=True)

    output_cleaned.close()
    output_filtered.close()
    fin.close()

    print(' >> total docs: {} remove_512 {} remove_256_javascript {} '
          'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'
          .format(num_docs, num_remove_512, num_remove_java,
                  num_remove_512_non_english, num_ftfy_fix_text,
                  num_general_cleaning), flush=True)


if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-files', nargs='*', required=True,
                        default=None,
                        help='Input json files that need to be cleaned')
    parser.add_argument('--tasks', nargs='*', required=True, default=None,
                        help='Tasks to perform on the input files, such as '
                             'remove_512, remove_256_javascript, '
                             'remove_512_non_english, ftfy_fix_text, and '
                             'general_cleaning. 256 and 512 refer to the '
                             'number of characters.')
    parser.add_argument('--output-path', type=str, default=None,
                        help='Directory where the output should go')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')

    args = parser.parse_args()

    print('cleanup dataset ...')
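
    # For each input file <name><ext>, write <name>_cleaned<ext> (kept
    # documents) and <name>_filtered<ext> (removed documents) under
    # --output-path.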
    for input_file in args.input_files:
        input_filename, input_filename_ext = os.path.splitext(
            Path(input_file).name)

        output_f_cleaned = os.path.join(
            args.output_path, input_filename + "_cleaned" +
            input_filename_ext)
        output_f_filtered = os.path.join(
            args.output_path, input_filename + "_filtered" +
            input_filename_ext)

        process_set(args, input_file, output_f_cleaned, output_f_filtered)

    print('done :-)', flush=True)