import re
import unicodedata
from string import punctuation


def remove_emoticon(text: str) -> str:
    """Strip simple ASCII emoticons such as ":)", ";D" or "=((" from the text."""
    emoticon_pattern = re.compile(r"(:|;|=|-|@)(\)|]|\(|v|>|<|D|@)+")
    return emoticon_pattern.sub("", text)


def remove_emoji(text: str) -> str:
    """Strip emoji and other pictographic symbols from the text."""
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # box drawing, geometric shapes & misc symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters and other symbols
        "\U0001F926-\U0001F937"  # additional gesture emojis
        "\U00010000-\U0010FFFF"  # everything outside the Basic Multilingual Plane
        "\u2640-\u2642"  # gender signs
        "\u2600-\u2B55"  # misc symbols and arrows
        "\u200d"  # zero-width joiner
        "\u23cf"  # eject symbol
        "\u23e9"  # fast-forward symbol
        "\u231a"  # watch
        "\ufe0f"  # variation selector
        "\u3030"  # wavy dash
        "]+",
        re.UNICODE,
    )

    return emoji_pattern.sub("", text)


def remove_consecutive_whitespace(text: str) -> str:
    """Collapse runs of whitespace into single spaces and trim both ends."""
    return " ".join(text.split())


def remove_consecutive_punctuation(text: str) -> str:
    """Collapse repeated punctuation (e.g. "!!!" or "??") into a single character."""
    pattern = re.compile(r"([%s])\1+" % re.escape(punctuation))
    return pattern.sub(r"\1", text)


def normalize_unicode(text: str) -> str:
    """Normalize the text to Unicode NFKC form (compose characters, map compatibility forms)."""
    return unicodedata.normalize("NFKC", text)


def normalize_accents(text: str) -> str:
    """Normalize Vietnamese tone-mark placement by moving the mark from the first
    vowel of the diphthong to the second (e.g. "òa" -> "oà", "úy" -> "uý")."""
    dict_map = {
        "òa": "oà",
        "Òa": "Oà",
        "ÒA": "OÀ",
        "óa": "oá",
        "Óa": "Oá",
        "ÓA": "OÁ",
        "ỏa": "oả",
        "Ỏa": "Oả",
        "ỎA": "OẢ",
        "õa": "oã",
        "Õa": "Oã",
        "ÕA": "OÃ",
        "ọa": "oạ",
        "Ọa": "Oạ",
        "ỌA": "OẠ",
        "òe": "oè",
        "Òe": "Oè",
        "ÒE": "OÈ",
        "óe": "oé",
        "Óe": "Oé",
        "ÓE": "OÉ",
        "ỏe": "oẻ",
        "Ỏe": "Oẻ",
        "ỎE": "OẺ",
        "õe": "oẽ",
        "Õe": "Oẽ",
        "ÕE": "OẼ",
        "ọe": "oẹ",
        "Ọe": "Oẹ",
        "ỌE": "OẸ",
        "ùy": "uỳ",
        "Ùy": "Uỳ",
        "ÙY": "UỲ",
        "úy": "uý",
        "Úy": "Uý",
        "ÚY": "UÝ",
        "ủy": "uỷ",
        "Ủy": "Uỷ",
        "ỦY": "UỶ",
        "ũy": "uỹ",
        "Ũy": "Uỹ",
        "ŨY": "UỸ",
        "ụy": "uỵ",
        "Ụy": "Uỵ",
        "ỤY": "UỴ",
    }

    # The mapping already enumerates every case variant, so a plain literal
    # replacement is enough; using re.sub with re.IGNORECASE here would match
    # capitalized forms (e.g. "Ủy") and replace them with the lowercase value.
    for k, v in dict_map.items():
        text = text.replace(k, v)

    return text


def preprocess_pipeline(text: str) -> str:
    """Run the full cleaning pipeline over a single piece of text."""
    # remove emoticons
    text = remove_emoticon(text)

    # remove emojis
    text = remove_emoji(text)

    # normalize unicode
    text = normalize_unicode(text)

    # normalize accents
    text = normalize_accents(text)

    # remove consecutive whitespace
    text = remove_consecutive_whitespace(text)

    # remove consecutive punctuation
    text = remove_consecutive_punctuation(text)

    return text
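

# Minimal usage sketch (not part of the original module): runs the pipeline on a
# sample Vietnamese sentence. The sample string and the expected result shown in
# the comment are illustrative assumptions, not taken from the source.
if __name__ == "__main__":
    sample = "Thuỷ điện Hòa Bình  :))  tuyệt vời!!! 😀"
    cleaned = preprocess_pipeline(sample)
    # Expected: emoticon and emoji removed, "Hòa" rewritten as "Hoà",
    # whitespace collapsed, "!!!" reduced to "!":
    # "Thuỷ điện Hoà Bình tuyệt vời!"
    print(cleaned)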