VerVelVel committed on
Commit 961ee03 · 1 Parent(s): 60a1ebe
.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv/
+ # Ignore Python cache files
+ __pycache__/
images/2_rubert_metrics.png ADDED
images/log_reg_metrics.png ADDED
images/nlp.jpg ADDED
models/model1/lstm_model.py ADDED
@@ -0,0 +1,71 @@
+ import torch
+ from torch import nn
+ import numpy as np
+ import torch.nn.functional as F
+ import joblib
+ from gensim.models import Word2Vec
+
+ vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')
+
+ wv = Word2Vec.load("models/model1/word2vec_model.bin")
+
+ # Build the embedding matrix: 3379 vocabulary entries, 32-dimensional Word2Vec vectors
+ embedding_matrix = np.zeros((3379, 32))
+ for word, i in vocab_to_int.items():
+     try:
+         embedding_matrix[i] = wv.wv[word]
+     except KeyError as e:
+         # Words missing from the Word2Vec vocabulary keep a zero vector
+         print(f'{e}: word: {word}')
+
+ embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+ class BahdanauAttention(nn.Module):
+     def __init__(self, hidden_size=32):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size)
+         self.linear_2 = nn.Linear(self.hidden_size, self.hidden_size)
+         # Attribute name kept as in the trained checkpoint ('alogn');
+         # renaming it would break load_state_dict on the saved weights
+         self.alogn = nn.Linear(self.hidden_size, 1)
+         self.tanh = nn.Tanh()
+
+     def forward(self, lstm_outputs, final_hidden):
+         keys = self.linear_1(lstm_outputs)   # keys.shape: [batch_size, seq_len, hidden_size]
+         query = self.linear_2(final_hidden)  # query.shape: [batch_size, hidden_size]
+         query = query.unsqueeze(1).expand(-1, lstm_outputs.size(1), -1)  # query.shape: [batch_size, seq_len, hidden_size]
+
+         keys_query = keys + query                    # keys_query.shape: [batch_size, seq_len, hidden_size]
+         att_weights = self.tanh(keys_query)          # att_weights.shape: [batch_size, seq_len, hidden_size]
+         att_weights = self.alogn(att_weights)        # att_weights.shape: [batch_size, seq_len, 1]
+         att_weights = F.softmax(att_weights.squeeze(2), dim=1)  # att_weights.shape: [batch_size, seq_len]
+         # Compute the context vector
+         context = torch.bmm(lstm_outputs.transpose(1, 2), att_weights.unsqueeze(2))  # context.shape: [batch_size, hidden_size, 1]
+         context = context.squeeze(2)  # context.shape: [batch_size, hidden_size]
+
+         return context, att_weights
+
+
+ # Model class definition
+ class LSTMConcatAttention(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+         self.embedding = embedding_layer
+         self.lstm = nn.LSTM(32, 32, batch_first=True)
+         self.attn = BahdanauAttention(32)
+         self.clf = nn.Sequential(
+             nn.Linear(32, 128),
+             nn.Dropout(),
+             nn.Tanh(),
+             nn.Linear(128, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         outputs, (h_n, _) = self.lstm(embeddings)
+         context, att_weights = self.attn(outputs, h_n.squeeze(0))
+         out = self.clf(context)
+         return out, att_weights
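A quick standalone sanity check of the additive-attention shapes used above (a minimal sketch with random tensors; batch_size=2 is an assumption, while seq_len=64 and hidden=32 match the model's padding length and hidden size):

import torch
from torch import nn
import torch.nn.functional as F

batch_size, seq_len, hidden = 2, 64, 32
lstm_outputs = torch.randn(batch_size, seq_len, hidden)  # stand-in for LSTM outputs
final_hidden = torch.randn(batch_size, hidden)           # stand-in for the last hidden state

keys = nn.Linear(hidden, hidden)(lstm_outputs)
query = nn.Linear(hidden, hidden)(final_hidden).unsqueeze(1).expand(-1, seq_len, -1)
scores = nn.Linear(hidden, 1)(torch.tanh(keys + query))  # [batch, seq_len, 1]
weights = F.softmax(scores.squeeze(2), dim=1)            # [batch, seq_len], rows sum to 1
context = torch.bmm(lstm_outputs.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)
assert context.shape == (batch_size, hidden)
assert torch.allclose(weights.sum(dim=1), torch.ones(batch_size))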
models/model1/lstm_preprocessor.py ADDED
@@ -0,0 +1,61 @@
+ import string
+ import numpy as np
+ import torch
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from nltk.corpus import stopwords
+ import joblib
+ import re
+
+ class TextPreprocessorWord2Vec(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.stop_words = set(stopwords.words('russian'))
+         self.vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')
+
+     def preprocess_text(self, text):
+         # Lowercase
+         text = text.lower()
+         # Strip HTML tags
+         text = re.sub('<.*?>', '', text)
+         # Strip punctuation
+         text = ''.join([c for c in text if c not in string.punctuation])
+         # Drop stop words
+         text = ' '.join([word for word in text.split() if word not in self.stop_words])
+         # Drop digit-only tokens
+         text = ' '.join([word for word in text.split() if not word.isdigit()])
+         return text
+
+     @staticmethod
+     def padding(review_int: list, seq_len: int) -> np.ndarray:
+         # Left-pad short sequences with zeros; truncate long ones to seq_len
+         features = np.zeros((len(review_int), seq_len), dtype=int)
+         for i, review in enumerate(review_int):
+             if len(review) <= seq_len:
+                 zeros = list(np.zeros(seq_len - len(review)))
+                 new = zeros + review
+             else:
+                 new = review[:seq_len]
+             features[i, :] = np.array(new)
+         return features
+
+     @staticmethod
+     def preprocess_single_string(
+         input_string: str,
+         seq_len: int,
+         vocab_to_int: dict,
+         verbose: bool = False
+     ) -> torch.Tensor:
+         preprocessed_string = TextPreprocessorWord2Vec().preprocess_text(input_string)
+         result_list = []
+         for word in preprocessed_string.split():
+             try:
+                 result_list.append(vocab_to_int[word])
+             except KeyError as e:
+                 # Out-of-vocabulary words are skipped
+                 if verbose:
+                     print(f'{e}: not in dictionary!')
+         result_padded = TextPreprocessorWord2Vec.padding([result_list], seq_len)[0]
+         return torch.tensor(result_padded)
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         return self.preprocess_single_string(X, 64, self.vocab_to_int)
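The padding helper left-pads short sequences with zeros and truncates long ones; a minimal standalone round-trip (the logic is copied out of the class, since constructing the real class needs lstm_vocab_to_int.pkl):

import numpy as np

def padding(review_int, seq_len):
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        new = list(np.zeros(seq_len - len(review))) + review if len(review) <= seq_len else review[:seq_len]
        features[i, :] = np.array(new)
    return features

print(padding([[5, 7, 9]], 8))  # [[0 0 0 0 0 5 7 9]] - left-padded
print(padding([[1] * 10], 8))   # [[1 1 1 1 1 1 1 1]] - truncated to seq_len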
models/model1/lstm_vocab_to_int.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47b13b436bb7c4727517cbff2b50e2e8f60f8bd944b963ce9af7ba07d936804d
+ size 66010
models/model1/lstm_weights ADDED
Binary file (498 kB)
models/model1/word2vec_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd6fbff4f54327bdf67ee352c0ed9cfbd63b2707348015f2d6d696b3231c7f10
+ size 993284
pages/1_policlinic.py ADDED
@@ -0,0 +1,121 @@
+ import streamlit as st
+ import joblib
+ import pandas as pd
+ # TextPreprocessor must be importable so joblib can unpickle the pipeline below
+ from models.model1.Custom_class import TextPreprocessor
+ from pathlib import Path
+ import sys
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import time
+
+ project_root = Path(__file__).resolve().parents[1]
+ models_path = project_root / 'models'
+ sys.path.append(str(models_path))
+ from models.model1.lstm_preprocessor import TextPreprocessorWord2Vec
+ from models.model1.lstm_model import LSTMConcatAttention
+
+ # Load the trained pipeline
+ pipeline = joblib.load('models/model1/logistic_regression_pipeline.pkl')
+
+ # Streamlit application
+ st.title('Классификация отзывов на русском языке')
+
+ input_text = st.text_area('Введите текст отзыва')
+
+ device = 'cpu'
+
+ # Load the LSTM model and vocabulary
+ @st.cache_resource
+ def load_lstm_model():
+     model = LSTMConcatAttention()
+     weights_path = models_path / 'model1' / 'lstm_weights'
+     state_dict = torch.load(weights_path, map_location=device)
+     model.load_state_dict(state_dict)
+     model.to(device)
+     model.eval()
+     return model
+
+ lstm_model = load_lstm_model()
+
+ @st.cache_resource
+ def load_int_to_vocab():
+     vocab_path = models_path / 'model1' / 'lstm_vocab_to_int.pkl'
+     vocab_to_int = joblib.load(vocab_path)
+     int_to_vocab = {j: i for i, j in vocab_to_int.items()}
+     return int_to_vocab
+
+ int_to_vocab = load_int_to_vocab()
+
+ def plot_and_predict_lstm(input_text):
+     preprocessor_lstm = TextPreprocessorWord2Vec()
+     preprocessed = preprocessor_lstm.transform(input_text)
+     with torch.inference_mode():
+         pred, att_scores = lstm_model(preprocessed.long().unsqueeze(0))
+
+     lstm_pred = pred.sigmoid().item()
+
+     # Keep only positions that are neither padding (index 0) nor the <pad> token
+     valid_indices = [i for i, x in enumerate(preprocessed) if x.item() != 0 and int_to_vocab[x.item()] != "<pad>"]
+
+     # Attention scores and word labels for the valid positions
+     valid_att_scores = att_scores.detach().cpu().numpy()[0][valid_indices]
+     valid_labels = [int_to_vocab[preprocessed[i].item()] for i in valid_indices]
+
+     # Sort ascending so the highest attention weights end up at the top of the bar chart
+     sorted_indices = np.argsort(valid_att_scores)
+     sorted_labels = [valid_labels[i] for i in sorted_indices]
+     sorted_att_scores = valid_att_scores[sorted_indices]
+
+     # Plot only the valid labels
+     plt.figure(figsize=(4, 8))
+     plt.barh(np.arange(len(sorted_indices)), sorted_att_scores)
+     plt.yticks(ticks=np.arange(len(sorted_indices)), labels=sorted_labels)
+
+     return lstm_pred, plt
+
+ if st.button('Предсказать'):
+     start_time_lr = time.time()
+     prediction = pipeline.predict(pd.Series([input_text]))
+     pred_probe = pipeline.predict_proba(pd.Series([input_text]))
+     pred_proba_rounded = np.round(pred_probe, 2).flatten()
+     if prediction[0] == 0:
+         predicted_class = "POSITIVE"
+     else:
+         predicted_class = "NEGATIVE"
+     end_time_lr = time.time()
+     time_lr = end_time_lr - start_time_lr
+     st.subheader('Предсказанный класс с помощью логистической регрессии и tf-idf')
+     # Report the probability of the predicted class, not always of class 0
+     st.write(f'**{predicted_class}** с вероятностью {pred_proba_rounded[prediction[0]]}')
+     st.write(f'Время выполнения расчетов {time_lr:.4f} секунд')
+
+     start_time_lstm = time.time()
+     lstm_pred, lstm_plot = plot_and_predict_lstm(input_text)
+     if lstm_pred > 0.5:
+         predicted_lstm_class = "POSITIVE"
+         lstm_proba = lstm_pred
+     else:
+         predicted_lstm_class = "NEGATIVE"
+         lstm_proba = 1 - lstm_pred
+     end_time_lstm = time.time()
+     time_lstm = end_time_lstm - start_time_lstm
+     st.subheader('Предсказанный класс с помощью LSTM + Word2Vec + BahdanauAttention:')
+     st.write(f'**{predicted_lstm_class}** с вероятностью {round(lstm_proba, 3)}')
+     st.write(f'Время выполнения расчетов {time_lstm:.4f} секунд')
+     st.pyplot(lstm_plot)
+
+ st.write("# Информация об обучении модели логистической регрессии и tf-idf:")
+ st.image(str(project_root / 'images/pipeline_logreg.png'))
+ st.write("Модель обучалась на предсказание 1 класса")
+ st.write("Размер датасета - 70597 текстов отзывов")
+ st.write("Проведена предобработка текста")
+
+ st.write("Метрики:")
+ st.image(str(project_root / 'images/log_reg_metrics.png'))
+
+ st.write("# Информация об обучении модели LSTM + Word2Vec + BahdanauAttention:")
+ st.write("Время обучения модели - 10 эпох")
+ st.write("Метрики на 10 эпохе:")
+ st.write("Train f1: 0.95, Val f1: 0.93")
+ st.write("Train accuracy: 0.94, Val accuracy: 0.92")
pages/2_comments.py ADDED
@@ -0,0 +1,94 @@
+ import streamlit as st
+ import torch
+ import sys
+ from pathlib import Path
+ import time
+ from transformers import AutoTokenizer
+
+ st.write("# Оценка степени токсичности пользовательского сообщения")
+
+ # Add the project and model paths
+ project_root = Path(__file__).resolve().parents[1]
+ models_path = project_root / 'models'
+ sys.path.append(str(models_path))
+ from models.model2.preprocess_text import TextPreprocessorBERT
+ from models.model2.model import BERTClassifier
+
+ device = 'cpu'
+
+ # Load the model and tokenizer
+ @st.cache_resource
+ def load_model():
+     model = BERTClassifier()
+     weights_path = models_path / 'model2' / 'model_weights_new.pth'
+     state_dict = torch.load(weights_path, map_location=device)
+     model.load_state_dict(state_dict)
+     model.to(device)
+     model.eval()
+     return model
+
+ @st.cache_resource
+ def load_tokenizer():
+     return AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-toxicity')
+
+ model = load_model()
+ tokenizer = load_tokenizer()
+
+ input_text = st.text_area('Введите текст сообщения')
+
+ if st.button('Предсказать'):
+     start_time = time.time()
+     # Preprocess the input
+     preprocessor = TextPreprocessorBERT()
+     preprocessed_text = preprocessor.transform(input_text)
+
+     # Tokenize
+     tokens = tokenizer.encode_plus(
+         preprocessed_text,
+         add_special_tokens=True,
+         truncation=True,
+         max_length=100,
+         padding='max_length',
+         return_tensors='pt'
+     )
+
+     # Extract input_ids and attention_mask from the tokens
+     input_ids = tokens['input_ids'].to(device)
+     attention_mask = tokens['attention_mask'].to(device)
+
+     # Predict
+     with torch.no_grad():
+         output = model(input_ids, attention_mask=attention_mask)
+
+     # Interpret the result: the sigmoid output is the probability of the TOXIC class
+     prediction = torch.sigmoid(output).item()
+     end_time = time.time()  # Stop the timer
+     execution_time = end_time - start_time
+     if prediction > 0.5:
+         class_pred = 'TOXIC'
+         class_proba = prediction
+     else:
+         class_pred = 'HEALTHY'
+         class_proba = 1 - prediction
+     st.subheader(f'Предсказанный класс токсичности: **{class_pred}** с вероятностью {class_proba:.4f}')
+     st.write(f'Время выполнения: {execution_time:.4f} секунд')
+
+ # Information about the model
+ st.write("# Информация об обучении модели rubert-tiny-toxicity:")
+ st.write("Модель обучалась на предсказание 1 класса")
+ st.write("Размер датасета - 14412 текстов сообщений")
+ st.write("Проведена предобработка текста")
+
+ st.image(str(project_root / 'images/2_rubert_metrics.png'), width=1000)
+ st.write("Время обучения модели - 50 эпох")
+ st.write("Метрики на 50 эпохе:")
+ st.write("Train f1: 0.73, Val f1: 0.77")
+ st.write("Train acc: 0.73, Val acc: 0.74")
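The tokenization step above can be exercised in isolation; a minimal sketch (it downloads the cointegrated/rubert-tiny-toxicity tokenizer named in the page code; the sample string is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-toxicity')
enc = tok.encode_plus(
    'пример сообщения',  # any user message
    add_special_tokens=True,
    truncation=True,
    max_length=100,
    padding='max_length',
    return_tensors='pt'
)
print(enc['input_ids'].shape)       # torch.Size([1, 100])
print(enc['attention_mask'].shape)  # torch.Size([1, 100])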
requirements.txt ADDED
@@ -0,0 +1,86 @@
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ DAWG-Python==0.7.2
+ docopt==0.6.2
+ filelock==3.14.0
+ fonttools==4.52.4
+ fsspec==2024.5.0
+ gensim==4.3.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.23.2
+ idna==3.7
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonschema==4.22.0
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.0
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ nltk==3.8.1
+ numpy==1.24.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.5.40
+ nvidia-nvtx-cu12==12.1.105
+ packaging==24.0
+ pandas==2.2.2
+ pathlib==1.0.1
+ pillow==10.3.0
+ protobuf==4.25.3
+ pyarrow==16.1.0
+ pydeck==0.9.1
+ Pygments==2.18.0
+ pymorphy2==0.9.1
+ pymorphy2-dicts-ru==2.4.417127.4579844
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.5.15
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.18.1
+ safetensors==0.4.3
+ scikit-learn==1.5.0
+ scipy==1.8.1
+ six==1.16.0
+ smart-open==7.0.4
+ smmap==5.0.1
+ streamlit==1.35.0
+ sympy==1.12.1
+ tenacity==8.3.0
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.3.0
+ tornado==6.4
+ tqdm==4.66.4
+ transformers==4.41.2
+ triton==2.3.0
+ typing_extensions==4.12.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ watchdog==4.0.1
+ wrapt==1.16.0
space.yaml ADDED
@@ -0,0 +1,2 @@
+ title: Nlp Bert Team
+ app_file: Hello.py