"""Streamlit app: classify the topic of a Telegram news post with an LSTM.

The script loads pre-trained LSTM weights and a word-to-index vocabulary from
``models/``, cleans the text typed by the user, feeds it to the model and
shows the predicted class.
"""
import pickle
import re
import string
from dataclasses import dataclass
from typing import Union

import nltk
import numpy as np
import pymorphy3
import streamlit as st
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from PIL import Image, ImageFilter, ImageDraw
from torch import Tensor

nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# ------------------------------------------------------------#


# Simplified way of declaring a plain configuration class.
@dataclass
class ConfigRNN:
    """Hyper-parameters used to build the recurrent classifier."""

    # One embedding row per vocabulary entry; sizes the embedding table.
    vocab_size: int
    device: str
    n_layers: int
    # The larger the dimension, the richer the encoding of a single word.
    embedding_dim: int
    hidden_size: int
    seq_len: int
    bidirectional: Union[bool, int]


net_config = ConfigRNN(
    vocab_size=17259 + 1,  # set by hand
    device="cpu",
    n_layers=1,
    embedding_dim=8,  # not the best value, but good enough for this task
    hidden_size=16,
    seq_len=30,  # set by hand
    bidirectional=False,
)

# ------------------------------------------------------------#


class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> small MLP head producing 5 class logits."""

    def __init__(self, rnn_conf=net_config) -> None:
        """Build the layers from ``rnn_conf`` (a ``ConfigRNN`` instance)."""
        super().__init__()

        self.embedding_dim = rnn_conf.embedding_dim
        self.hidden_size = rnn_conf.hidden_size
        self.bidirectional = rnn_conf.bidirectional
        self.n_layers = rnn_conf.n_layers

        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True,
            num_layers=self.n_layers,
            # nn.LSTM applies dropout only between stacked layers; passing a
            # non-zero value with a single layer does nothing but emit a
            # UserWarning, so it is enabled only when there are >1 layers.
            dropout=0.5 if self.n_layers > 1 else 0.0,
        )
        self.bidirect_factor = 2 if self.bidirectional else 1
        self.clf = nn.Sequential(
            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
            nn.Dropout(),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(32, 5),  # number of classes: len(df['label'].unique())
        )

    def model_description(self):
        """Return a short tag such as ``lstm_onedirect_1``."""
        direction = "bidirect" if self.bidirectional else "onedirect"
        return f"lstm_{direction}_{self.n_layers}"

    def forward(self, x: torch.Tensor):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: integer tensor of token indices; with ``batch_first=True`` the
                LSTM treats dim 0 as batch and dim 1 as sequence position.

        Returns:
            Tensor of raw logits with 5 values per batch element.
        """
        embeddings = self.embedding(x)
        out, _ = self.lstm(embeddings)
        # Keep, for every batch element, the full output vector of the last
        # time step only.
        out = out[:, -1, :]
        return self.clf(out)


# ------------------------------------------------------------#

# Model loading
@st.cache_resource
def load_model():
    """Build the classifier, load its weights on CPU and switch to eval mode.

    Cached by Streamlit so the weights are read once per process rather than
    on every script rerun.
    """
    model = LSTMClassifier(net_config)
    model.load_state_dict(torch.load(
        "models/lstm_weights.pth", map_location=torch.device("cpu")))
    model.eval()
    return model


model_lstm = load_model()

# ------------------------------------------------------------#


def padding(text_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or trim) every token list to exactly ``seq_len`` items.

    Args:
        text_int (list): list of token-index lists.
        seq_len (int): target length; longer sequences keep only their first
            ``seq_len`` tokens, shorter ones are padded with zeros on the left.

    Returns:
        np.ndarray: integer array of shape ``(len(text_int), seq_len)``.
    """
    features = np.zeros((len(text_int), seq_len), dtype=int)
    for i, review in enumerate(text_int):
        trimmed = review[:seq_len]
        if trimmed:
            # Right-align the tokens; the leading cells stay zero (padding).
            features[i, seq_len - len(trimmed):] = trimmed
    return features


morph = pymorphy3.MorphAnalyzer()


def lemmatize(text):
    """Replace every whitespace-separated word with its pymorphy3 normal form.

    Only lemmatization happens here; stop words are not removed.
    """
    # Split the text into words.
    words = text.split()
    # Take the first (top-ranked) parse of each word.
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    # Re-assemble the text from the lemmas.
    return ' '.join(lemmatized_words)


def data_preprocessing(text):
    """Clean raw post text down to lowercase Cyrillic words.

    Steps run in a fixed order; the regular expressions are kept exactly as
    they were so the output stays compatible with the stored vocabulary.
    """
    # From Phase 1
    text = re.sub(r':[a-zA-Z]+:', '', text)  # drop :emoji: codes
    text = text.lower()  # lowercase
    text = re.sub(r'@[\w_-]+', '', text)  # drop user mentions
    text = re.sub(r'#(\w+)', '', text)  # drop hashtags
    text = re.sub(r'\d+', '', text)  # drop digits
    # Drop links
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\s+', ' ', text)  # collapse repeated whitespace
    # Keep only purely Cyrillic words (this removes English words).
    text = ' '.join(re.findall(r'\b[а-яА-ЯёЁ]+\b', text))

    # From Phase 2
    # NOTE(review): after the Cyrillic-only filter above the text contains
    # just Cyrillic letters and spaces, so the three steps below cannot match
    # anything; presumably kept to mirror the training pipeline - confirm.
    text = re.sub("<.*?>", "", text)  # html tags
    text = "".join([c for c in text if c not in string.punctuation])
    splitted_text = [word for word in text.split() if word not in stop_words]
    text = " ".join(splitted_text)
    return text.strip()


def preprocess_single_string(
    input_string: str, seq_len: int, vocab_to_int: dict, verbose: bool = False
) -> Tensor:
    """Run every preprocessing step on one string and encode it for the model.

    Args:
        input_string (str): raw text to preprocess.
        seq_len (int): target sequence length; longer inputs are trimmed,
            shorter ones are left-padded with zeros.
        vocab_to_int (dict): word corpus ``{'word': int index}``.
        verbose (bool): when True, print every word missing from the
            vocabulary.

    Returns:
        Tensor: 1-D tensor of ``seq_len`` token indices. ``Tensor(...)``
        yields the default floating dtype, so callers must cast to an integer
        dtype before an embedding lookup.
    """
    # The lemmatized text used to be computed and then thrown away because
    # the cleaning step was fed the raw input again; chain the two steps.
    # NOTE(review): assumes the vocabulary was built on lemmatized text -
    # confirm against the training pipeline.
    preprocessed_string = lemmatize(input_string)
    preprocessed_string = data_preprocessing(preprocessed_string)

    result_list = []
    for word in preprocessed_string.split():
        if word in vocab_to_int:
            result_list.append(vocab_to_int[word])
        elif verbose:
            # Out-of-vocabulary words are skipped on purpose.
            print(f'{word!r}: not in dictionary!')

    result_padded = padding([result_list], seq_len)[0]
    return Tensor(result_padded)


@st.cache_resource
def load_vocab():
    """Load the word -> index vocabulary once per process.

    NOTE: ``pickle.load`` can execute arbitrary code; this is acceptable only
    because the file ships with the application and is not user supplied.
    """
    with open('models/vocab_to_int.pkl', 'rb') as f:
        return pickle.load(f)


# ------------------------------------------------------------#

st.title("Классификация тематики новостей из телеграм каналов")
# st.write('Model summary:')
text = st.text_input('Input some news')
text_4_test = text

# Load the vocabulary from file (cached across Streamlit reruns).
vocab_to_int = load_vocab()

if text != '':
    test_review = preprocess_single_string(
        text_4_test, net_config.seq_len, vocab_to_int)
    # Cast with .to(): torch.tensor(<Tensor>) copy-constructs and warns.
    test_review = test_review.to(torch.int64)
    # Inference only, so autograd bookkeeping is not needed.
    with torch.no_grad():
        logits = model_lstm(test_review.unsqueeze(0))
    # Softmax is the matching activation for mutually exclusive classes; the
    # argmax is the same as for the raw logits.
    result = torch.softmax(logits, dim=-1)
    num = result.argmax().item()

st.write('---')
st.write('Initial text:')
st.write(text)
st.write('---')
st.write('Preprocessing:')
# Show the same lemmatize -> clean chain that the model input goes through.
st.write(data_preprocessing(lemmatize(text)))
st.write('---')
st.write('Classes:')
classes = ['крипта', 'мода', 'спорт', 'технологии', 'финансы']
st.write('крипта *', 'мода *', 'спорт *',
         'технологии *', 'финансы')
st.write('---')
st.write('Predict:')
if text != '':
    st.write('Classification: ', classes[num])
    st.write('Label num: ', num)

# Load the image through PIL
image = Image.open("images/tg_metrics.png")
# Display it
st.image(image, caption="Кошмареус переобучения", use_container_width=True)