import re
import string

from pyvi.ViTokenizer import tokenize
import bm25s

# Non-greedy match of anything between '<' and '>' on a single line
# ('.' does not match newlines because DOTALL is not set).
_TAG_RE = re.compile('<.*?>')

# A run of one or more whitespace characters. The group is repeated, so
# group 1 holds the LAST whitespace character of the run.
_WS_RUN_RE = re.compile(r'(\s)+')

# Maps every ASCII punctuation character except '_' to a single space.
# '_' is kept on purpose: process_text() runs the pyvi tokenizer before
# normalize_text(), and that tokenizer presumably joins the syllables of a
# multi-syllable word with '_' -- TODO confirm against the pyvi docs.
_PUNCT_TO_SPACE = str.maketrans(
    {char: ' ' for char in string.punctuation.replace('_', '')}
)


def clean_text(text: str) -> str:
    """Strip markup-like tags and collapse whitespace runs.

    Every ``<...>`` span (shortest match, not crossing newlines) is
    removed, the result is stripped of leading/trailing whitespace, and
    each run of consecutive whitespace characters is then replaced by the
    last whitespace character of that run.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    without_tags = _TAG_RE.sub('', text).strip()
    return _WS_RUN_RE.sub(r'\1', without_tags)


def normalize_text(text: str) -> str:
    """Replace ASCII punctuation (except ``_``) with spaces and lowercase.

    Each punctuation character becomes exactly one space, so adjacent
    punctuation produces consecutive spaces; these are not collapsed.
    The result is lowercased and stripped of surrounding whitespace.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # One translate() pass gives the same result as replacing each
    # punctuation character in turn, since the replacement (a space) is
    # never itself a punctuation character.
    return text.translate(_PUNCT_TO_SPACE).lower().strip()


def process_text(text: str) -> str:
    """Run the full preprocessing pipeline on *text*.

    Steps, in order: ``clean_text`` -> pyvi ``tokenize`` ->
    ``normalize_text``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, tokenized and normalized string.
    """
    cleaned = clean_text(text)
    segmented = tokenize(cleaned)
    return normalize_text(segmented)