# TreatyClassifier / preprocessing.py
import re

import nltk
from natasha import Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, Segmenter
from nltk.corpus import stopwords

nltk.download('stopwords')
class TextCleaner:
    """Normalizes raw text: lowercasing, stopword and punctuation removal,
    and optional lemmatization with natasha."""

    def __init__(self, lemma: bool = True):
        self.lemma = lemma
        # natasha components for segmentation and morphological tagging
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(emb)
        # stopword lists and punctuation characters to strip
        self.en_stops = stopwords.words('english')
        self.ru_stops = stopwords.words('russian')
        self.punc = '''!()-[]{};:'",<>./?@#$%^&*_~'''
        # keep only Cyrillic words (matched case-insensitively later)
        self.words_pattern = '[а-я]+'
    def execute(self, text):
        """Runs the cleaning pipeline and lemmatizes the result when enabled."""
        text = self.text_preprocessing(text)
        if self.lemma:
            text = self.lemmatize(text)
        return text
    def text_preprocessing(self, data):
        """Lowercases the text, removes stopwords and punctuation, and keeps
        only Cyrillic words."""
        data = " ".join(x.lower() for x in data.split())
        # str.replace does not interpret regex patterns, so strip
        # non-word characters with re.sub instead
        data = re.sub(r'[^\w\s]', '', data)
        data = " ".join(x for x in data.split()
                        if x not in self.ru_stops and x not in self.en_stops)
        for punc in self.punc:
            if punc in data:
                data = data.replace(punc, "")
        data = re.sub(' +', ' ', data)
        return " ".join(
            re.findall(self.words_pattern, data, flags=re.IGNORECASE))
    def lemmatize(self, text):
        """Lemmatizes each token using natasha's morphological tagger."""
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
        return " ".join(token.lemma for token in doc.tokens)