File size: 4,310 Bytes

dcc5cd1

import argparse
import unicodedata
import re
from tqdm import tqdm

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import re
import unicodedata

PUNCTS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~«»'
UNICODE_PUNCT = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
    "■": " ",  # added for Mimir
}

UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]")


def replace_unicode_punct(text: str) -> str:
    return "".join(UNICODE_PUNCT.get(c, c) for c in text)


def remove_unicode_punct(text: str) -> str:
    """More aggressive version of replace_unicode_punct but also faster."""
    return UNICODE_PUNCT_RE.sub("", text)


def strip_accents(line: str) -> str:
    """Strips accents from a piece of text."""
    nfd = unicodedata.normalize("NFD", line)
    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
    if len(output) == line:
        return line
    return "".join(output)


# Build a regex matching all control characters.
NON_PRINTING_CHARS_RE = re.compile(
    f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
)
DIGIT_RE = re.compile(r"\d")
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "")
)


def remove_non_printing_char(text: str) -> str:
    return NON_PRINTING_CHARS_RE.sub("", text)


def normalize(line: str, accent=True, case=True, numbers=True, punct=1) -> str:
    line = line.strip()
    if not line:
        return line
    if case:
        line = line.lower()
    if accent:
        line = strip_accents(line)
    if numbers:
        line = DIGIT_RE.sub("0", line)
    if punct == 1:
        line = replace_unicode_punct(line)
    elif punct == 2:
        line = remove_unicode_punct(line)
    line = remove_non_printing_char(line)
    return line


def slow_normalize_for_dedup(line: str) -> str:
    return normalize(line, accent=False, case=True, numbers=True, punct=2)


def normalize_for_dedup(line: str) -> str:
    line = line.strip()
    if not line:
        return line
    # case
    line = line.lower()
    # numbers
    line = DIGIT_RE.sub("0", line)
    line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
    return line

## START OF MIMIR CODE
def normalize_text(line):
    normalized_line = unicodedata.normalize('NFKC', line).lower()

    # Add a trailing dot if the line does not end with a punctuation mark
    normalized_line = normalized_line.rstrip()
    if  normalized_line and normalized_line[-1] not in PUNCTS:
        normalized_line += '.'

    # Replace newline characters with spaces (if any remain)
    # normalized_line = re.sub(r'\r\n|\r|\n', ' ', normalized_line)
    normalized_line = normalize(normalized_line, accent=False, case=True, numbers=True, punct=1)
    return normalized_line


def normalize_file(input_file, output_file, cutoff=None):
    with (open(output_file, 'w', encoding='utf-8') as f,
          open(input_file, 'r', encoding='utf-8') as lines):
        for line_count, line in tqdm(enumerate(lines), desc="Processing"):
            f.write(normalize_text(line) + "\n")
            if cutoff and line_count >= cutoff:
                break


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Normalize text file line by line, ensure trailing punctuation, replace newlines with spaces, and show progress.')
    parser.add_argument('input_file', type=str, help='Input file path')
    parser.add_argument('output_file', type=str, help='Output file path')
    parser.add_argument('--cutoff', required=False, type=int, help='Max number of lines to process')

    args = parser.parse_args()

    normalize_file(args.input_file, args.output_file, args.cutoff)