import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import pymorphy2


class DataPreprocessor:
    """Tokenize and lemmatize Russian texts, dropping stopwords and punctuation.

    Pipeline: lowercase -> WordPunctTokenizer -> filter stop tokens ->
    pymorphy2 lemmatization. The logical connectives 'и', 'или', 'не' are
    deliberately kept even though NLTK lists them as stopwords.
    """

    def __init__(self):
        # quiet=True: the corpus is still fetched if missing, but repeated
        # instantiations no longer spam download progress to stdout.
        nltk.download('stopwords', quiet=True)
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # Keep 'и'/'или'/'не' (negation and conjunctions carry meaning
        # downstream); drop the remaining stopwords and punctuation chars.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)
        # token -> lemma memo: morph.parse is costly and real corpora repeat
        # tokens heavily, so cache per instance (not lru_cache on a method,
        # which would pin `self` alive and key the cache on it).
        self._lemma_cache = {}

    def tokenize_data(self, texts):
        """Lowercase and tokenize each text; return a list of token lists."""
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def _is_stop(self, token):
        """Return True for tokens that should be discarded.

        Covers stopwords, single punctuation characters, and multi-character
        punctuation runs ('...', '?!', '--') that WordPunctTokenizer emits as
        one token — those are not in string.punctuation and previously
        leaked into the output.
        """
        if token in self.stop_tokens:
            return True
        # all() is True for the empty string too, which is also worth dropping.
        return all(ch in self.punctuation for ch in token)

    def lemmatize_tokens_string(self, tokens_string):
        """Return the lemmas of the non-stop tokens of one tokenized text."""
        lemmas = []
        cache = self._lemma_cache
        for token in tokens_string:
            if self._is_stop(token):
                continue
            lemma = cache.get(token)
            if lemma is None:
                # pymorphy2 returns parses sorted by score; take the best one.
                lemma = self.morph.parse(token)[0].normal_form
                cache[token] = lemma
            lemmas.append(lemma)
        return lemmas

    def lemmatize_tokens(self, tokens):
        """Lemmatize each token list of `tokens` in place (returns None)."""
        for i, token_list in enumerate(tokens):
            tokens[i] = self.lemmatize_tokens_string(token_list)

    def preprocess_texts(self, texts):
        """Full pipeline: tokenize `texts`, then lemmatize; return token lists."""
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens