Spaces:
Build error
Build error
File size: 1,171 Bytes
4589b64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import re
class TextPreprocess:
    """Clean raw text and replace teencode tokens via a lookup table.

    The lookup table is loaded once, at construction time, from a UTF-8
    text file holding one ``key<TAB>value`` entry per line. It is stored
    on the instance as ``teencode_json`` (a plain ``dict``).
    """

    # Compiled once at class level so repeated clean_text() calls do not
    # have to look the patterns up again on every invocation.
    _HASHTAG_RE = re.compile(r'#\w+')
    _URL_RE = re.compile(r'http\S+')
    _DIGITS_RE = re.compile(r'\d+')
    # NOTE: ``\w`` includes the underscore, so underscores survive cleaning.
    _SPECIAL_RE = re.compile(r'[^\w\s]')

    def __init__(self, teencode_dir: str = "./storage/teencode.txt") -> None:
        """Load the teencode table from *teencode_dir*.

        Args:
            teencode_dir: Path to the tab-separated teencode file.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.get_teencode(teencode_dir)

    def get_teencode(self, teencode_dir: str) -> None:
        """Parse the teencode file and store it as ``self.teencode_json``.

        Each line is split on its first tab into a key and a value. Lines
        without a tab (blank lines included) are skipped instead of
        aborting the whole load; if a line contains several tabs,
        everything after the first one is kept as the value. When a key
        appears more than once, the last occurrence wins.

        Args:
            teencode_dir: Path to the tab-separated teencode file.

        Raises:
            OSError: If the file cannot be opened.
        """
        mapping = {}
        with open(teencode_dir, "r", encoding="utf-8") as f:
            for line in f:
                # partition() never raises, unlike tuple-unpacking a split().
                key, sep, value = line.rstrip("\n").partition("\t")
                if not sep:
                    # No tab on this line: blank or malformed entry.
                    continue
                mapping[key] = value
        self.teencode_json = mapping

    def teencode_normalize(self, text: str) -> str:
        """Replace every whitespace-separated token found in the table.

        Tokens that are not keys of ``self.teencode_json`` are kept as they
        are. The result is re-joined with single spaces, so any run of
        whitespace in *text* collapses to one space.
        """
        return " ".join(
            self.teencode_json.get(token, token) for token in text.split()
        )

    def clean_text(self, text: str) -> str:
        """Strip hashtags, URLs, digits and punctuation, then lowercase.

        The steps run in a fixed order: hashtags, URLs, digits, remaining
        non-word/non-space characters, whitespace collapsing, lowercasing.
        """
        # Remove hashtags (the '#' together with the word that follows it).
        text = self._HASHTAG_RE.sub('', text)
        # Remove links: anything starting with "http" up to the next space.
        text = self._URL_RE.sub('', text)
        # Remove digits.
        text = self._DIGITS_RE.sub('', text)
        # Remove special characters (anything that is not \w or whitespace).
        text = self._SPECIAL_RE.sub('', text)
        # Collapse whitespace runs left behind by the removals above.
        text = " ".join(text.split())
        return text.lower()

    def preprocess(self, text: str) -> str:
        """Run ``clean_text`` followed by ``teencode_normalize``.

        Cleaning lowercases the text first, so table lookups are made
        against lowercase tokens.
        """
        cleaned_text = self.clean_text(text)
        return self.teencode_normalize(cleaned_text)