Spaces:
Sleeping
Sleeping
File size: 2,909 Bytes
6136947 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import re
import unicodedata
from string import punctuation
def remove_emoticon(text: str):
    """Delete ASCII emoticons such as ``:)``, ``;D`` or ``=((`` from *text*.

    An emoticon is one "eyes" character (``:``, ``;``, ``=``, ``-`` or ``@``)
    followed by one or more "mouth" characters (``)``, ``]``, ``(``, ``v``,
    ``>``, ``<``, ``D`` or ``@``). The pattern is not anchored to word
    boundaries, so a matching sequence inside a word (for example the ``-v``
    in ``e-visa``) is removed as well.
    """
    return re.sub(r"(:|;|=|-|@)(\)|]|\(|v|>|<|D|@)+", "", text)
# Character class of code points treated as emoji or pictographic symbols.
# Only symbol blocks and individually listed emoji are included, so ordinary
# letters of any script (CJK ideographs, kana, Hangul, fullwidth forms, ...)
# are left untouched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U0001F926-\U0001F937"  # gesture / person emoji
    "\U00010000-\U0010FFFF"  # every supplementary-plane code point
    "\u2500-\u2BEF"  # box drawing through miscellaneous symbols and arrows
    "\u2702-\u27B0"  # dingbats
    "\u2640-\u2642"  # gender signs
    "\u2600-\u2B55"  # miscellaneous symbols
    "\u24C2"  # circled latin capital letter M
    "\u200d"  # zero-width joiner used in emoji sequences
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph "congratulation"
    "\u3299"  # circled ideograph "secret"
    "]+"
)


def remove_emoji(text: str) -> str:
    """Return *text* with emoji and pictographic symbols removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted;
    surrounding whitespace is left as is.

    Earlier revisions used the single range U+24C2-U+1F251, which spans most
    of the Basic Multilingual Plane above U+24C2 and therefore also deleted
    CJK, kana, Hangul and fullwidth text. That range is now split into the
    specific emoji code points it was meant to cover.
    """
    return _EMOJI_PATTERN.sub("", text)
def remove_consecutive_whitespace(text: str):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped, because ``str.split()`` with
    no arguments discards empty leading and trailing fields.
    """
    tokens = text.split()
    return " ".join(tokens)
def remove_consecutive_punctuation(text: str):
    """Reduce each run of one repeated punctuation mark to a single mark.

    Only repetitions of the *same* ASCII punctuation character are collapsed
    (``"!!!"`` becomes ``"!"``); a sequence of different marks such as
    ``"?!"`` is left unchanged.
    """
    # A character class of all ASCII punctuation, captured so that the
    # back-reference only matches further copies of that same character.
    repeated_mark = "([" + re.escape(punctuation) + r"])\1+"
    return re.sub(repeated_mark, r"\1", text)
def normalize_unicode(text: str):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so compatibility variants (fullwidth letters, ligatures)
    are folded and base letter + combining mark pairs are composed.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
# Lowercase vowel pairs whose tone mark sits on the first vowel, mapped to the
# same pair with the tone mark moved to the second vowel.
_TONE_SHIFTS = {
    "òa": "oà",
    "óa": "oá",
    "ỏa": "oả",
    "õa": "oã",
    "ọa": "oạ",
    "òe": "oè",
    "óe": "oé",
    "ỏe": "oẻ",
    "õe": "oẽ",
    "ọe": "oẹ",
    "ùy": "uỳ",
    "úy": "uý",
    "ủy": "uỷ",
    "ũy": "uỹ",
    "ụy": "uỵ",
}

# One alternation over all pairs; IGNORECASE lets it find every case variant,
# and the replacement callback restores the original casing.
_TONE_PATTERN = re.compile("|".join(_TONE_SHIFTS), re.IGNORECASE)


def _shift_tone(match):
    """Return the matched vowel pair with its tone mark moved, keeping case."""
    found = match.group(0)
    shifted = _TONE_SHIFTS.get(found.lower())
    if shifted is None:
        # Defensive: a case-insensitive match whose lowercase form is not in
        # the table is left exactly as it was.
        return found
    # Both strings have the same length, so case can be copied position by
    # position from the matched text onto the replacement.
    return "".join(
        new.upper() if old.isupper() else new
        for old, new in zip(found, shifted)
    )


def normalize_accents(text: str) -> str:
    """Move Vietnamese tone marks in ``oa``/``oe``/``uy`` to the second vowel.

    For example ``"hòa"`` becomes ``"hoà"`` and ``"thủy"`` becomes ``"thuỷ"``.
    The case of every character is preserved (``"HÒA"`` becomes ``"HOÀ"``).
    The previous implementation applied each lowercase replacement with
    ``re.IGNORECASE`` first, which lowercased upper- and title-case input.

    Only precomposed characters are matched; decomposed input (base letter
    followed by a combining mark) is returned unchanged.
    """
    return _TONE_PATTERN.sub(_shift_tone, text)
def preprocess_pipeline(text):
    """Run *text* through every cleaning step of this module and return it.

    The steps run in a fixed order: ASCII emoticons are removed, emoji are
    removed, the text is NFKC-normalized, tone marks are normalized,
    whitespace runs are collapsed and finally repeated punctuation is
    collapsed.
    """
    stages = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for stage in stages:
        text = stage(text)
    return text
|