from PIL import Image, ImageFilter, ImageDraw
import streamlit as st

import pickle
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
from dataclasses import dataclass
from typing import Union
import re
import string
import pymorphy3
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))


# ------------------------------------------------------------#
# Упрощенный метод создания класса

@dataclass
class ConfigRNN:
    vocab_size: int  # сколько слов - столько embedding-ов; для инициализации embedding параметров
    device: str
    n_layers: int
    embedding_dim: int  # чем больше, тем сложнее можно закодировать слово
    hidden_size: int
    seq_len: int
    bidirectional: Union[bool, int]


net_config = ConfigRNN(
    vocab_size=17259 + 1,  # -> hand
    device="cpu",
    n_layers=1,
    embedding_dim=8,  # не лучшее значение, но в рамках задачи сойдет
    hidden_size=16,
    seq_len=30,  # -> hand
    bidirectional=False,
)
# ------------------------------------------------------------#


class LSTMClassifier(nn.Module):
    def __init__(self, rnn_conf=net_config) -> None:
        super().__init__()

        self.embedding_dim = rnn_conf.embedding_dim
        self.hidden_size = rnn_conf.hidden_size
        self.bidirectional = rnn_conf.bidirectional
        self.n_layers = rnn_conf.n_layers

        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True,
            num_layers=self.n_layers,
            dropout=0.5
        )
        self.bidirect_factor = 2 if self.bidirectional else 1
        self.clf = nn.Sequential(
            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
            nn.Dropout(),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(32, 5)  # len(df['label'].unique())
        )

    def model_description(self):
        direction = "bidirect" if self.bidirectional else "onedirect"
        return f"lstm_{direction}_{self.n_layers}"

    def forward(self, x: torch.Tensor):
        embeddings = self.embedding(x)
        out, _ = self.lstm(embeddings)
        # print(out.shape)
        # [все элементы батча, последний h_n, все элементы последнего h_n]
        out = out[:, -1, :]
        # print(out.shape)
        out = self.clf(out)
        return out
# ------------------------------------------------------------#
# Загрузка модели


@st.cache_resource
def load_model():
    model = LSTMClassifier(net_config)
    model.load_state_dict(torch.load(
        "models/lstm_weights.pth", map_location=torch.device("cpu")))
    model.eval()
    return model


model_lstm = load_model()
# ------------------------------------------------------------#


def padding(text_int: list, seq_len: int) -> np.ndarray:
    """Make left-sided padding for input list of tokens

    Args:
        review_int (list): input list of tokens
        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros

    Returns:
        np.array: padded sequences
    """
    features = np.zeros((len(text_int), seq_len), dtype=int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features


morph = pymorphy3.MorphAnalyzer()


def lemmatize(text):
    # Разбиваем текст на слова
    words = text.split()

    # Лемматизируем каждое слово и убираем стоп-слова
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]

    # Собираем текст из лемматизированных слов
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text


def data_preprocessing(text):
    # From Phase 1
    text = re.sub(r':[a-zA-Z]+:', '', text)  # Убираем смайлики
    text = text.lower()  # Переводим текст в нижний регистр
    text = re.sub(r'@[\w_-]+', '', text)  # Убираем упоминания пользователей
    text = re.sub(r'#(\w+)', '', text)  # Убираем хэштеги
    text = re.sub(r'\d+', '', text)  # Убираем цифры
    # Убираем ссылки
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\s+', ' ', text)  # Убираем лишние пробелы
    # Удаление английских слов
    text = ' '.join(re.findall(r'\b[а-яА-ЯёЁ]+\b', text))
    # From Phase 2
    text = re.sub("<.*?>", "", text)  # html tags
    text = "".join([c for c in text if c not in string.punctuation])
    splitted_text = [word for word in text.split() if word not in stop_words]
    text = " ".join(splitted_text)
    return text.strip()


def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> Tensor:
    """Function for all preprocessing steps on a single string

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
        vocab_to_int (dict, optional): word corpus {'word' : int index}. Defaults to vocab_to_int.

    Returns:
        list: preprocessed string
    """
    preprocessed_string = lemmatize(input_string)
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
            pass
    result_padded = padding([result_list], seq_len)[0]

    return Tensor(result_padded)
# ------------------------------------------------------------#


st.title("Классификация тематики новостей из телеграм каналов")
# st.write('Model summary:')
text = st.text_input('Input some news')
text_4_test = text

# Загрузка словаря из файла
with open('models/vocab_to_int.pkl', 'rb') as f:
    vocab_to_int = pickle.load(f)

if text != '':
    test_review = preprocess_single_string(
        text_4_test, net_config.seq_len, vocab_to_int)
    test_review = torch.tensor(test_review, dtype=torch.int64)
    result = torch.sigmoid(model_lstm(test_review.unsqueeze(0)))
    num = result.argmax().item()

st.write('---')
st.write('Initial text:')
st.write(text)
st.write('---')
st.write('Preprocessing:')
st.write(data_preprocessing(text))
st.write('---')
st.write('Classes:')
classes = ['крипта', 'мода', 'спорт', 'технологии', 'финансы']
st.write('крипта *', 'мода *', 'спорт *', 'технологии *', 'финансы')
st.write('---')

st.write('Predict:')
if text != '':
    st.write('Classification: ', classes[num])
    st.write('Label num: ', num)

# Загружаем изображение через PIL
image = Image.open("images/tg_metrics.png")

# Отображение
st.image(image, caption="Кошмареус переобучения", use_container_width=True)