Graduation / pipelines /BM25 /preprocess.py
DuyTa's picture
Upload folder using huggingface_hub
74b1bac verified
raw
history blame
509 Bytes
import re
import string
from pyvi.ViTokenizer import tokenize
import bm25s
def clean_text(text):
    """Remove tag-like markup from *text* and squeeze whitespace runs.

    Steps, in order:
      1. Delete every shortest ``<...>`` span. ``.`` does not match a
         newline here, so a span that crosses a line break is left as is.
         Nothing is inserted in place of a removed span, so the text on
         either side of it is joined directly.
      2. Trim leading and trailing whitespace.
      3. Replace each run of whitespace with the final character of that
         run (e.g. ``"\\t\\n"`` becomes ``"\\n"``).

    Returns the cleaned string.
    """
    without_tags = re.sub('<.*?>', '', text)
    trimmed = without_tags.strip()
    # The repeated group keeps only its last capture, so ``\1`` is the last
    # whitespace character of the run being replaced.
    squeezed = re.sub(r'(\s)+', r'\1', trimmed)
    return squeezed
# Translation table sending every ASCII punctuation mark except '_' to a
# single space. Built once at import time so each call is one pass.
_PUNCT_TO_SPACE = str.maketrans(
    {mark: ' ' for mark in string.punctuation if mark != '_'}
)


def normalize_text(text):
    """Replace ASCII punctuation with spaces, lowercase, and trim *text*.

    Every character in ``string.punctuation`` other than the underscore is
    replaced by one space; underscores are kept untouched. Consecutive
    spaces produced by the replacement are not collapsed. The result is
    then lowercased and stripped of leading/trailing whitespace.

    NOTE(review): the underscore is presumably preserved because the
    upstream tokenizer joins multi-syllable words with it -- confirm
    against the tokenizer's documentation.

    Returns the normalized string.
    """
    # One translate() pass replaces the previous chain of per-character
    # str.replace() calls; the output is identical.
    return text.translate(_PUNCT_TO_SPACE).lower().strip()
def process_text(text):
    """Run the full preprocessing pipeline on *text* and return the result.

    The stages are applied in this fixed order:
      1. ``clean_text``     -- markup removal and whitespace squeezing.
      2. ``tokenize``       -- pyvi's ``ViTokenizer.tokenize``.
      3. ``normalize_text`` -- punctuation removal (underscores kept),
         lowercasing, and trimming.
    """
    # Normalization runs after tokenization, so the tokenizer still sees
    # the original punctuation and casing.
    return normalize_text(tokenize(clean_text(text)))