File size: 509 Bytes
74b1bac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import re
import string
from pyvi.ViTokenizer import tokenize
import bm25s

def clean_text(text):
    """Remove ``<...>`` tag spans from *text* and collapse whitespace runs.

    The string is stripped of leading/trailing whitespace after tag removal,
    then every run of consecutive whitespace characters is reduced to a
    single character.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # Non-greedy so each ``<...>`` span is removed individually.  No DOTALL
    # flag is passed, so a span broken across lines is left in place.
    tag_free = re.sub('<.*?>', '', text)
    trimmed = tag_free.strip()
    # A repeated capture group retains only its last match, so each run of
    # whitespace is replaced by the final character of that run.
    return re.sub(r'(\s)+', r'\1', trimmed)

# Maps every ASCII punctuation character except '_' to a single space.
# Built once at import time so each call is a single translate() pass.
_PUNCT_TO_SPACE = str.maketrans(
    {ch: ' ' for ch in string.punctuation if ch != '_'}
)


def normalize_text(text):
    """Replace punctuation with spaces, lowercase, and trim *text*.

    Every character in ``string.punctuation`` other than the underscore is
    replaced by one space (so adjacent punctuation/space yields multiple
    spaces; they are not collapsed here).  Underscores are kept.

    Args:
        text: The input string.

    Returns:
        The lowercased string with punctuation replaced by spaces and
        leading/trailing whitespace removed.
    """
    return text.translate(_PUNCT_TO_SPACE).lower().strip()

def process_text(text):
    """Run *text* through the full preprocessing pipeline.

    The stages are applied in this order:

    1. ``clean_text`` -- tag removal and whitespace collapsing.
    2. ``tokenize`` (from ``pyvi.ViTokenizer``) -- presumably Vietnamese word
       segmentation that joins syllables with underscores, which would be
       why ``normalize_text`` keeps ``_``; confirm against pyvi's docs.
    3. ``normalize_text`` -- punctuation to spaces, lowercase, trim.

    Args:
        text: The raw input string.

    Returns:
        The processed string.
    """
    return normalize_text(tokenize(clean_text(text)))