Spaces:

nguyen1207
/

vi-en-mt-teencode-slang

Sleeping

App Files Files Community

vi-en-mt-teencode-slang / preprocessing.py

nguyen1207

initial commit

6136947 3 months ago

raw

history blame contribute delete

2.91 kB

	import re
	import unicodedata
	from string import punctuation


	def remove_emoticon(text: str) -> str:
	    """Remove ASCII emoticons such as ``:)``, ``;D``, ``=))`` or ``:v``.

	    An emoticon is one "eyes" character (``:``, ``;``, ``=``, ``-`` or
	    ``@``) followed by one or more "mouth" characters (``)``, ``]``, ``(``,
	    ``v``, ``>``, ``<``, ``D`` or ``@``).

	    No word-boundary check is made, so the same two-part sequence is also
	    removed when it occurs inside a longer token (for example ``-v`` in a
	    hyphenated word).

	    Args:
	        text: The input string.

	    Returns:
	        ``text`` with every emoticon match deleted.
	    """
	    # Character classes are used instead of alternation: an escaped pipe
	    # (``\|``) is a literal "|" in a regex, so a pattern built from
	    # ``\|``-separated alternatives never matches a real emoticon.
	    emoticon_pattern = r"[:;=\-@][)\](v><D@]+"
	    return re.sub(emoticon_pattern, "", text)


	def remove_emoji(text: str):
	    """Delete emoji and related symbol characters from ``text``.

	    The character class below is assembled from a list of code-point
	    ranges. Several of the ranges are very broad (``U+24C2-U+1F251`` and
	    ``U+10000-U+10FFFF``), so besides emoji they also remove every
	    character in those spans, including CJK text and all supplementary
	    planes.

	    Args:
	        text: The input string.

	    Returns:
	        ``text`` with every run of matching characters removed.
	    """
	    char_ranges = (
	        "\U0001F600-\U0001F64F",  # emoticons
	        "\U0001F300-\U0001F5FF",  # symbols & pictographs
	        "\U0001F680-\U0001F6FF",  # transport & map symbols
	        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
	        "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
	        "\U00002702-\U000027B0",  # dingbats
	        "\U000024C2-\U0001F251",  # broad span; also covers CJK
	        "\U0001f926-\U0001f937",
	        "\U00010000-\U0010ffff",  # every supplementary-plane code point
	        "\u2640-\u2642",  # gender signs
	        "\u2600-\u2B55",
	        "\u200d",  # zero width joiner
	        "\u23cf",
	        "\u23e9",
	        "\u231a",
	        "\ufe0f",  # variation selector-16
	        "\u3030",
	    )
	    emoji_class = "[" + "".join(char_ranges) + "]+"
	    return re.sub(emoji_class, "", text, flags=re.UNICODE)


	def remove_consecutive_whitespace(text: str):
	    """Collapse every run of whitespace in ``text`` to a single space.

	    Leading and trailing whitespace is dropped as a side effect of
	    splitting on whitespace and re-joining the pieces.
	    """
	    tokens = text.split()
	    return " ".join(tokens)


	def remove_consecutive_punctuation(text: str):
	    """Reduce each run of one repeated punctuation mark to a single mark.

	    Only repeats of the *same* character are collapsed (``"!!!"`` becomes
	    ``"!"``); a mixed sequence such as ``"?!"`` is left untouched. The set
	    of punctuation characters is ``string.punctuation``.
	    """
	    punct_class = re.escape(punctuation)
	    # Group 1 captures one punctuation char; ``\1+`` requires it to repeat.
	    repeated = re.compile(rf"([{punct_class}])\1+")
	    return repeated.sub(r"\1", text)


	def normalize_unicode(text: str):
	    """Return ``text`` converted to Unicode normalization form NFKC."""
	    normalized = unicodedata.normalize("NFKC", text)
	    return normalized


	def normalize_accents(text: str) -> str:
	    """Move the tone mark in ``oa``, ``oe`` and ``uy`` onto the second vowel.

	    Rewrites pairs such as ``òa`` -> ``oà``, ``óe`` -> ``oé`` and
	    ``ủy`` -> ``uỷ``. All five tone marks (grave, acute, hook above, tilde,
	    dot below) are handled, and the case of each of the two letters is
	    preserved, so ``HÒA`` becomes ``HOÀ`` rather than being lowercased.

	    Only precomposed (NFC/NFKC) input is matched; text whose tone marks are
	    stored as separate combining characters is returned unchanged.

	    Args:
	        text: The input string.

	    Returns:
	        ``text`` with the tone marks of the affected vowel pairs relocated.
	    """
	    # First alternative: toned o (ò ó ỏ õ ọ) followed by a/e.
	    # Second alternative: toned u (ù ú ủ ũ ụ) followed by y.
	    # Code points are spelled as escapes so the pattern cannot be altered
	    # by an editor re-normalizing the source file.
	    tone_pair = (
	        r"[\u00f2\u00f3\u1ecf\u00f5\u1ecd][ae]"
	        r"|[\u00f9\u00fa\u1ee7\u0169\u1ee5]y"
	    )

	    def _shift_tone(match):
	        first, second = match.group(0)
	        # NFD splits the toned vowel into its base letter plus one combining
	        # tone mark; the base letter keeps the original case.
	        base, mark = unicodedata.normalize("NFD", first)
	        # Re-compose the mark onto the second vowel, which keeps its case.
	        return base + unicodedata.normalize("NFC", second + mark)

	    # IGNORECASE lets one pattern cover every upper/lower combination; the
	    # callback (not a fixed replacement string) is what preserves the case.
	    return re.sub(tone_pair, _shift_tone, text, flags=re.IGNORECASE)


	def preprocess_pipeline(text):
	    """Run ``text`` through every cleaning step, in a fixed order.

	    The steps are applied one after another, each receiving the previous
	    step's output: emoticon removal, emoji removal, Unicode normalization,
	    accent normalization, whitespace collapsing and finally repeated
	    punctuation collapsing.
	    """
	    stages = (
	        remove_emoticon,
	        remove_emoji,
	        normalize_unicode,
	        normalize_accents,
	        remove_consecutive_whitespace,
	        remove_consecutive_punctuation,
	    )
	    for stage in stages:
	        text = stage(text)
	    return text