Spaces:

nielklug
/

enhg-parsing

Sleeping

File size: 2,923 Bytes

8778cfe

# Whole-token replacements: typographic quotation marks and dashes are mapped
# to plain ASCII quotes and a double hyphen. Keys are grouped by run of
# identical replacement value.
PTB_UNESCAPE_MAPPING = {
    **dict.fromkeys("«»", '"'),
    **dict.fromkeys("‘’", "'"),
    **dict.fromkeys("“”„", '"'),
    **dict.fromkeys("‹›", "'"),
    **dict.fromkeys("\u2013\u2014", "--"),  # en dash, em dash
}

# A token that starts with one of these is attached to the token before it.
NO_SPACE_BEFORE = {
    "-RRB-", "-RCB-", "-RSB-", "''",
    "%", ".", ",", "!", "?", ":", ";",
}
# A token that ends with one of these is attached to the token after it.
NO_SPACE_AFTER = {
    "-LRB-", "-LCB-", "-LSB-", "``", "`",
    "$", "#",
}
# English clitics (compared against the lower-cased token) that attach to the
# token before them.
NO_SPACE_BEFORE_TOKENS_ENGLISH = set("' 's 'll 're 'd 'm 've".split())
# Escaped tokens that begin/end with "-" but must not be treated as hyphens.
PTB_DASH_ESCAPED = {
    "-LRB-", "-RRB-",
    "-LCB-", "-RCB-",
    "-LSB-", "-RSB-",
    "--",
}


def ptb_unescape(words):
    """Return a new list with PTB escape sequences in each token undone.

    Every token is first looked up as a whole in ``PTB_UNESCAPE_MAPPING``
    (falling back to the token itself when absent). The result then has a
    fixed sequence of substring substitutions applied, so escapes that occur
    in the middle of a token are rewritten too.

    Args:
        words: Iterable of token strings.

    Returns:
        A list of rewritten token strings, in the same order as the input.
    """
    # Order matters: "``" must be handled before the single backtick, and the
    # "''" rule runs last so it also sees apostrophes produced by "`" -> "'".
    substitutions = (
        # This un-escaping for / and * was not yet added for the
        # parser version in https://arxiv.org/abs/1812.11760v1
        # and related model releases (e.g. benepar_en2)
        ("\\/", "/"),
        ("\\*", "*"),
        # Mid-token punctuation occurs in biomedical text
        ("-LSB-", "["),
        ("-RSB-", "]"),
        ("-LRB-", "("),
        ("-RRB-", ")"),
        ("-LCB-", "{"),
        ("-RCB-", "}"),
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
    )

    unescaped = []
    for token in words:
        text = PTB_UNESCAPE_MAPPING.get(token, token)
        for escaped, plain in substitutions:
            text = text.replace(escaped, plain)
        unescaped.append(text)
    return unescaped


def guess_space_after_non_english(escaped_words):
    """Guess, for each escaped token, whether a space follows it.

    Uses only the language-independent rules: hyphen-attached tokens, the
    ``NO_SPACE_BEFORE`` / ``NO_SPACE_AFTER`` prefixes and suffixes, and a
    bare apostrophe token.

    Args:
        escaped_words: Sequence of PTB-escaped token strings.

    Returns:
        A list of booleans, one per token. An entry is ``False`` when the
        heuristics decide the token is directly attached to the next one.
    """
    # str.startswith/endswith accept a tuple of candidates.
    dash_escapes = tuple(PTB_DASH_ESCAPED)
    attach_left = tuple(NO_SPACE_BEFORE)
    attach_right = tuple(NO_SPACE_AFTER)

    flags = [True for _ in escaped_words]
    for idx, token in enumerate(escaped_words):
        # A leading "-" counts as a hyphen unless it belongs to an escape
        # such as "-RRB-" or "--".
        leading_hyphen = token.startswith("-") and not token.startswith(dash_escapes)
        joins_previous = (
            leading_hyphen or token.startswith(attach_left) or token == "'"
        )
        if idx > 0 and joins_previous:
            flags[idx - 1] = False

        # Same test for a trailing "-".
        trailing_hyphen = token.endswith("-") and not token.endswith(dash_escapes)
        if trailing_hyphen or token.endswith(attach_right):
            flags[idx] = False

    return flags


def guess_space_after(escaped_words, for_english=True):
    """Guess, for each escaped token, whether a space follows it.

    Args:
        escaped_words: Sequence of PTB-escaped token strings.
        for_english: When true (the default), English-specific rules are
            applied in addition to the generic ones: "n't", "not" following
            "can", and the clitics in ``NO_SPACE_BEFORE_TOKENS_ENGLISH`` all
            attach to the preceding token. When false, the work is delegated
            to ``guess_space_after_non_english``.

    Returns:
        A list of booleans, one per token. An entry is ``False`` when the
        heuristics decide the token is directly attached to the next one.
    """
    if not for_english:
        return guess_space_after_non_english(escaped_words)

    # str.startswith/endswith accept a tuple of candidates.
    dash_escapes = tuple(PTB_DASH_ESCAPED)
    attach_left = tuple(NO_SPACE_BEFORE)
    attach_right = tuple(NO_SPACE_AFTER)

    flags = [True for _ in escaped_words]
    for idx, token in enumerate(escaped_words):
        lowered = token.lower()

        if idx > 0:
            # "do" + "n't" and "can" + "not" are written as one word.
            negation = lowered == "n't" or (
                lowered == "not" and escaped_words[idx - 1].lower() == "can"
            )
            # A leading "-" counts as a hyphen unless it belongs to an
            # escape such as "-RRB-" or "--".
            leading_hyphen = token.startswith("-") and not token.startswith(
                dash_escapes
            )
            if (
                negation
                or leading_hyphen
                or token.startswith(attach_left)
                or lowered in NO_SPACE_BEFORE_TOKENS_ENGLISH
            ):
                flags[idx - 1] = False

        # Same test for a trailing "-".
        trailing_hyphen = token.endswith("-") and not token.endswith(dash_escapes)
        if trailing_hyphen or token.endswith(attach_right):
            flags[idx] = False

    return flags