import re

import nltk
from natasha import (Doc, MorphVocab, NewsEmbedding, NewsMorphTagger,
                     Segmenter)
from nltk.corpus import stopwords

nltk.download('stopwords')


class TextCleaner:
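    """Clean raw text: lowercasing, punctuation and stopword removal,
    Cyrillic-word filtering, and optional lemmatization via natasha."""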

    def __init__(self, lemma: bool = True):
        self.lemma = lemma
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
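        # Pretrained news embeddings back the morphological tagger.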
        emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(emb)
        self.en_stops = stopwords.words('english')
        self.ru_stops = stopwords.words('russian')
        self.punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
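        # Final filter: keep only Cyrillic (Russian) word tokens.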
        self.words_pattern = '[а-я]+'

    def execute(self, text: str) -> str:
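        """Preprocess the text and, if enabled, lemmatize it."""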
        text = self.text_preprocessing(text)
        if self.lemma:
            text = self.lemmatize(text)
        return text

    def text_preprocessing(self, data: str) -> str:
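        """Lowercase, drop punctuation and stopwords, keep Cyrillic tokens."""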
        data = " ".join(x.lower() for x in data.split())
        data = re.sub(r'[^\w\s]', '', data)
        data = " ".join(x for x in data.split()
                        if x not in self.ru_stops and x not in self.en_stops)
        for punc in self.punc:
            if punc in data:
                data = data.replace(punc, "")
        data = re.sub(' +', ' ', data)
        return " ".join(
            re.findall(self.words_pattern, data, flags=re.IGNORECASE))

    def lemmatize(self, text: str) -> str:
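        """Lemmatize each token with natasha and join the lemmas."""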
        doc = Doc(text)
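        # Segment into tokens and tag morphology before lemmatizing.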
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
        return " ".join(token.lemma for token in doc.tokens)