VerVelVel committed on
Commit 961ee03 · 1 Parent(s): 60a1ebe
.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv/
+ # Ignore Python cache files
+ __pycache__/
images/2_rubert_metrics.png ADDED
images/log_reg_metrics.png ADDED
images/nlp.jpg ADDED
models/model1/lstm_model.py ADDED
@@ -0,0 +1,71 @@
+ import torch
+ from torch import nn
+ import numpy as np
+ import torch.nn.functional as F
+ import joblib
+ from gensim.models import Word2Vec
+
+ vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')
+
+ wv = Word2Vec.load("models/model1/word2vec_model.bin")
+
+ # Build the embedding matrix: 3379 vocabulary entries, 32-dimensional Word2Vec vectors
+ embedding_matrix = np.zeros((3379, 32))
+ for word, i in vocab_to_int.items():
+     try:
+         embedding_matrix[i] = wv.wv[word]
+     except KeyError as e:
+         # Words missing from the Word2Vec vocabulary keep a zero vector
+         print(f'{e}: word: {word}')
+
+ embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+ class BahdanauAttention(nn.Module):
+     def __init__(self, hidden_size=32):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size)
+         self.linear_2 = nn.Linear(self.hidden_size, self.hidden_size)
+         # Attribute name kept as in the trained checkpoint ('alogn');
+         # renaming it would break load_state_dict on the saved weights
+         self.alogn = nn.Linear(self.hidden_size, 1)
+         self.tanh = nn.Tanh()
+
+     def forward(self, lstm_outputs, final_hidden):
+         keys = self.linear_1(lstm_outputs)   # keys.shape: [batch_size, seq_len, hidden_size]
+         query = self.linear_2(final_hidden)  # query.shape: [batch_size, hidden_size]
+         query = query.unsqueeze(1).expand(-1, lstm_outputs.size(1), -1)  # query.shape: [batch_size, seq_len, hidden_size]
+
+         keys_query = keys + query                    # keys_query.shape: [batch_size, seq_len, hidden_size]
+         att_weights = self.tanh(keys_query)          # att_weights.shape: [batch_size, seq_len, hidden_size]
+         att_weights = self.alogn(att_weights)        # att_weights.shape: [batch_size, seq_len, 1]
+         att_weights = F.softmax(att_weights.squeeze(2), dim=1)  # att_weights.shape: [batch_size, seq_len]
+         # Compute the context vector
+         context = torch.bmm(lstm_outputs.transpose(1, 2), att_weights.unsqueeze(2))  # context.shape: [batch_size, hidden_size, 1]
+         context = context.squeeze(2)  # context.shape: [batch_size, hidden_size]
+
+         return context, att_weights
+
+
+ # Model class definition
+ class LSTMConcatAttention(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+         self.embedding = embedding_layer
+         self.lstm = nn.LSTM(32, 32, batch_first=True)
+         self.attn = BahdanauAttention(32)
+         self.clf = nn.Sequential(
+             nn.Linear(32, 128),
+             nn.Dropout(),
+             nn.Tanh(),
+             nn.Linear(128, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         outputs, (h_n, _) = self.lstm(embeddings)
+         context, att_weights = self.attn(outputs, h_n.squeeze(0))
+         out = self.clf(context)
+         return out, att_weights
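A quick standalone sanity check of the additive-attention shapes used above (a minimal sketch with random tensors; batch_size=2 is an assumption, while seq_len=64 and hidden=32 match the model's padding length and hidden size):

import torch
from torch import nn
import torch.nn.functional as F

batch_size, seq_len, hidden = 2, 64, 32
lstm_outputs = torch.randn(batch_size, seq_len, hidden)  # stand-in for LSTM outputs
final_hidden = torch.randn(batch_size, hidden)           # stand-in for the last hidden state

keys = nn.Linear(hidden, hidden)(lstm_outputs)
query = nn.Linear(hidden, hidden)(final_hidden).unsqueeze(1).expand(-1, seq_len, -1)
scores = nn.Linear(hidden, 1)(torch.tanh(keys + query))  # [batch, seq_len, 1]
weights = F.softmax(scores.squeeze(2), dim=1)            # [batch, seq_len], rows sum to 1
context = torch.bmm(lstm_outputs.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)
assert context.shape == (batch_size, hidden)
assert torch.allclose(weights.sum(dim=1), torch.ones(batch_size))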
models/model1/lstm_preprocessor.py ADDED
@@ -0,0 +1,61 @@
+ import string
+ import numpy as np
+ import torch
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from nltk.corpus import stopwords
+ import joblib
+ import re
+
+ class TextPreprocessorWord2Vec(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.stop_words = set(stopwords.words('russian'))
+         self.vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')
+
+     def preprocess_text(self, text):
+         # Lowercase
+         text = text.lower()
+         # Strip HTML tags
+         text = re.sub('<.*?>', '', text)
+         # Strip punctuation
+         text = ''.join([c for c in text if c not in string.punctuation])
+         # Drop stop words
+         text = ' '.join([word for word in text.split() if word not in self.stop_words])
+         # Drop digit-only tokens
+         text = ' '.join([word for word in text.split() if not word.isdigit()])
+         return text
+
+     @staticmethod
+     def padding(review_int: list, seq_len: int) -> np.ndarray:
+         # Left-pad short sequences with zeros; truncate long ones to seq_len
+         features = np.zeros((len(review_int), seq_len), dtype=int)
+         for i, review in enumerate(review_int):
+             if len(review) <= seq_len:
+                 zeros = list(np.zeros(seq_len - len(review)))
+                 new = zeros + review
+             else:
+                 new = review[:seq_len]
+             features[i, :] = np.array(new)
+         return features
+
+     @staticmethod
+     def preprocess_single_string(
+         input_string: str,
+         seq_len: int,
+         vocab_to_int: dict,
+         verbose: bool = False
+     ) -> torch.Tensor:
+         preprocessed_string = TextPreprocessorWord2Vec().preprocess_text(input_string)
+         result_list = []
+         for word in preprocessed_string.split():
+             try:
+                 result_list.append(vocab_to_int[word])
+             except KeyError as e:
+                 # Out-of-vocabulary words are skipped
+                 if verbose:
+                     print(f'{e}: not in dictionary!')
+         result_padded = TextPreprocessorWord2Vec.padding([result_list], seq_len)[0]
+         return torch.tensor(result_padded)
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         return self.preprocess_single_string(X, 64, self.vocab_to_int)
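The padding helper left-pads short sequences with zeros and truncates long ones; a minimal standalone round-trip (the logic is copied out of the class, since constructing the real class needs lstm_vocab_to_int.pkl):

import numpy as np

def padding(review_int, seq_len):
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        new = list(np.zeros(seq_len - len(review))) + review if len(review) <= seq_len else review[:seq_len]
        features[i, :] = np.array(new)
    return features

print(padding([[5, 7, 9]], 8))  # [[0 0 0 0 0 5 7 9]] - left-padded
print(padding([[1] * 10], 8))   # [[1 1 1 1 1 1 1 1]] - truncated to seq_len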
models/model1/lstm_vocab_to_int.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47b13b436bb7c4727517cbff2b50e2e8f60f8bd944b963ce9af7ba07d936804d
+ size 66010
models/model1/lstm_weights ADDED
Binary file (498 kB)
models/model1/word2vec_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd6fbff4f54327bdf67ee352c0ed9cfbd63b2707348015f2d6d696b3231c7f10
+ size 993284
pages/1_policlinic.py ADDED
@@ -0,0 +1,121 @@
+ import streamlit as st
+ import joblib
+ import pandas as pd
+ # TextPreprocessor must be importable so joblib can unpickle the pipeline below
+ from models.model1.Custom_class import TextPreprocessor
+ from pathlib import Path
+ import sys
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import time
+
+ project_root = Path(__file__).resolve().parents[1]
+ models_path = project_root / 'models'
+ sys.path.append(str(models_path))
+ from models.model1.lstm_preprocessor import TextPreprocessorWord2Vec
+ from models.model1.lstm_model import LSTMConcatAttention
+
+ # Load the trained pipeline
+ pipeline = joblib.load('models/model1/logistic_regression_pipeline.pkl')
+
+ # Streamlit application
+ st.title('Классификация отзывов на русском языке')
+
+ input_text = st.text_area('Введите текст отзыва')
+
+ device = 'cpu'
+
+ # Load the LSTM model and vocabulary
+ @st.cache_resource
+ def load_lstm_model():
+     model = LSTMConcatAttention()
+     weights_path = models_path / 'model1' / 'lstm_weights'
+     state_dict = torch.load(weights_path, map_location=device)
+     model.load_state_dict(state_dict)
+     model.to(device)
+     model.eval()
+     return model
+
+ lstm_model = load_lstm_model()
+
+ @st.cache_resource
+ def load_int_to_vocab():
+     vocab_path = models_path / 'model1' / 'lstm_vocab_to_int.pkl'
+     vocab_to_int = joblib.load(vocab_path)
+     int_to_vocab = {j: i for i, j in vocab_to_int.items()}
+     return int_to_vocab
+
+ int_to_vocab = load_int_to_vocab()
+
+ def plot_and_predict_lstm(input_text):
+     preprocessor_lstm = TextPreprocessorWord2Vec()
+     preprocessed = preprocessor_lstm.transform(input_text)
+     with torch.inference_mode():
+         pred, att_scores = lstm_model(preprocessed.long().unsqueeze(0))
+
+     lstm_pred = pred.sigmoid().item()
+
+     # Keep only positions that are neither padding (index 0) nor the <pad> token
+     valid_indices = [i for i, x in enumerate(preprocessed) if x.item() != 0 and int_to_vocab[x.item()] != "<pad>"]
+
+     # Attention scores and word labels for the valid positions
+     valid_att_scores = att_scores.detach().cpu().numpy()[0][valid_indices]
+     valid_labels = [int_to_vocab[preprocessed[i].item()] for i in valid_indices]
+
+     # Sort ascending so the highest attention weights end up at the top of the bar chart
+     sorted_indices = np.argsort(valid_att_scores)
+     sorted_labels = [valid_labels[i] for i in sorted_indices]
+     sorted_att_scores = valid_att_scores[sorted_indices]
+
+     # Plot only the valid labels
+     plt.figure(figsize=(4, 8))
+     plt.barh(np.arange(len(sorted_indices)), sorted_att_scores)
+     plt.yticks(ticks=np.arange(len(sorted_indices)), labels=sorted_labels)
+
+     return lstm_pred, plt
+
+ if st.button('Предсказать'):
+     start_time_lr = time.time()
+     prediction = pipeline.predict(pd.Series([input_text]))
+     pred_probe = pipeline.predict_proba(pd.Series([input_text]))
+     pred_proba_rounded = np.round(pred_probe, 2).flatten()
+     if prediction[0] == 0:
+         predicted_class = "POSITIVE"
+     else:
+         predicted_class = "NEGATIVE"
+     end_time_lr = time.time()
+     time_lr = end_time_lr - start_time_lr
+     st.subheader('Предсказанный класс с помощью логистической регрессии и tf-idf')
+     # Report the probability of the predicted class, not always of class 0
+     st.write(f'**{predicted_class}** с вероятностью {pred_proba_rounded[prediction[0]]}')
+     st.write(f'Время выполнения расчетов {time_lr:.4f} секунд')
+
+     start_time_lstm = time.time()
+     lstm_pred, lstm_plot = plot_and_predict_lstm(input_text)
+     if lstm_pred > 0.5:
+         predicted_lstm_class = "POSITIVE"
+         lstm_proba = lstm_pred
+     else:
+         predicted_lstm_class = "NEGATIVE"
+         lstm_proba = 1 - lstm_pred
+     end_time_lstm = time.time()
+     time_lstm = end_time_lstm - start_time_lstm
+     st.subheader('Предсказанный класс с помощью LSTM + Word2Vec + BahdanauAttention:')
+     st.write(f'**{predicted_lstm_class}** с вероятностью {round(lstm_proba, 3)}')
+     st.write(f'Время выполнения расчетов {time_lstm:.4f} секунд')
+     st.pyplot(lstm_plot)
+
+ st.write("# Информация об обучении модели логистической регрессии и tf-idf:")
+ st.image(str(project_root / 'images/pipeline_logreg.png'))
+ st.write("Модель обучалась на предсказание 1 класса")
+ st.write("Размер датасета - 70597 текстов отзывов")
+ st.write("Проведена предобработка текста")
+
+ st.write("Метрики:")
+ st.image(str(project_root / 'images/log_reg_metrics.png'))
+
+ st.write("# Информация об обучении модели LSTM + Word2Vec + BahdanauAttention:")
+ st.write("Время обучения модели - 10 эпох")
+ st.write("Метрики на 10 эпохе:")
+ st.write("Train f1: 0.95, Val f1: 0.93")
+ st.write("Train accuracy: 0.94, Val accuracy: 0.92")
pages/2_comments.py ADDED
@@ -0,0 +1,94 @@
+ import streamlit as st
+ import torch
+ import sys
+ from pathlib import Path
+ import time
+ from transformers import AutoTokenizer
+
+ st.write("# Оценка степени токсичности пользовательского сообщения")
+
+ # Add the project and model paths
+ project_root = Path(__file__).resolve().parents[1]
+ models_path = project_root / 'models'
+ sys.path.append(str(models_path))
+ from models.model2.preprocess_text import TextPreprocessorBERT
+ from models.model2.model import BERTClassifier
+
+ device = 'cpu'
+
+ # Load the model and tokenizer
+ @st.cache_resource
+ def load_model():
+     model = BERTClassifier()
+     weights_path = models_path / 'model2' / 'model_weights_new.pth'
+     state_dict = torch.load(weights_path, map_location=device)
+     model.load_state_dict(state_dict)
+     model.to(device)
+     model.eval()
+     return model
+
+ @st.cache_resource
+ def load_tokenizer():
+     return AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-toxicity')
+
+ model = load_model()
+ tokenizer = load_tokenizer()
+
+ input_text = st.text_area('Введите текст сообщения')
+
+ if st.button('Предсказать'):
+     start_time = time.time()
+     # Preprocess the input
+     preprocessor = TextPreprocessorBERT()
+     preprocessed_text = preprocessor.transform(input_text)
+
+     # Tokenize
+     tokens = tokenizer.encode_plus(
+         preprocessed_text,
+         add_special_tokens=True,
+         truncation=True,
+         max_length=100,
+         padding='max_length',
+         return_tensors='pt'
+     )
+
+     # Extract input_ids and attention_mask from the tokens
+     input_ids = tokens['input_ids'].to(device)
+     attention_mask = tokens['attention_mask'].to(device)
+
+     # Predict
+     with torch.no_grad():
+         output = model(input_ids, attention_mask=attention_mask)
+
+     # Interpret the result: the sigmoid output is the probability of the TOXIC class
+     prediction = torch.sigmoid(output).item()
+     end_time = time.time()  # Stop the timer
+     execution_time = end_time - start_time
+     if prediction > 0.5:
+         class_pred = 'TOXIC'
+         class_proba = prediction
+     else:
+         class_pred = 'HEALTHY'
+         class_proba = 1 - prediction
+     st.subheader(f'Предсказанный класс токсичности: **{class_pred}** с вероятностью {class_proba:.4f}')
+     st.write(f'Время выполнения: {execution_time:.4f} секунд')
+
+ # Information about the model
+ st.write("# Информация об обучении модели rubert-tiny-toxicity:")
+ st.write("Модель обучалась на предсказание 1 класса")
+ st.write("Размер датасета - 14412 текстов сообщений")
+ st.write("Проведена предобработка текста")
+
+ st.image(str(project_root / 'images/2_rubert_metrics.png'), width=1000)
+ st.write("Время обучения модели - 50 эпох")
+ st.write("Метрики на 50 эпохе:")
+ st.write("Train f1: 0.73, Val f1: 0.77")
+ st.write("Train acc: 0.73, Val acc: 0.74")
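The tokenization step above can be exercised in isolation; a minimal sketch (it downloads the cointegrated/rubert-tiny-toxicity tokenizer named in the page code; the sample string is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-toxicity')
enc = tok.encode_plus(
    'пример сообщения',  # any user message
    add_special_tokens=True,
    truncation=True,
    max_length=100,
    padding='max_length',
    return_tensors='pt'
)
print(enc['input_ids'].shape)       # torch.Size([1, 100])
print(enc['attention_mask'].shape)  # torch.Size([1, 100])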
requirements.txt ADDED
@@ -0,0 +1,86 @@
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ DAWG-Python==0.7.2
+ docopt==0.6.2
+ filelock==3.14.0
+ fonttools==4.52.4
+ fsspec==2024.5.0
+ gensim==4.3.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.23.2
+ idna==3.7
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonschema==4.22.0
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.0
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ nltk==3.8.1
+ numpy==1.24.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.5.40
+ nvidia-nvtx-cu12==12.1.105
+ packaging==24.0
+ pandas==2.2.2
+ pathlib==1.0.1
+ pillow==10.3.0
+ protobuf==4.25.3
+ pyarrow==16.1.0
+ pydeck==0.9.1
+ Pygments==2.18.0
+ pymorphy2==0.9.1
+ pymorphy2-dicts-ru==2.4.417127.4579844
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.5.15
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.18.1
+ safetensors==0.4.3
+ scikit-learn==1.5.0
+ scipy==1.8.1
+ six==1.16.0
+ smart-open==7.0.4
+ smmap==5.0.1
+ streamlit==1.35.0
+ sympy==1.12.1
+ tenacity==8.3.0
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.3.0
+ tornado==6.4
+ tqdm==4.66.4
+ transformers==4.41.2
+ triton==2.3.0
+ typing_extensions==4.12.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ watchdog==4.0.1
+ wrapt==1.16.0
space.yaml ADDED
@@ -0,0 +1,2 @@
+ title: Nlp Bert Team
+ app_file: Hello.py