VerVelVel committed on
Commit
ecbd4e2
·
1 Parent(s): 087390d

second page

Browse files
models/model2/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
models/model2/__pycache__/model.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
models/model2/__pycache__/preprocess_text.cpython-310.pyc ADDED
Binary file (1.62 kB). View file
 
models/model2/preprocess_text.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.base import BaseEstimator, TransformerMixin
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ import string
8
+
9
# Fetch the NLTK stop-word corpus when the module is imported and keep a
# module-level copy of the Russian list for callers that import it directly.
nltk.download('stopwords')
stop_words = set(stopwords.words("russian"))


class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw comments for the BERT model.

    Cleaning steps, in order: strip every character outside the allowed
    alphabet, drop URL-like tokens, collapse whitespace, and remove Russian
    stop words. The transformer is stateless: ``fit`` learns nothing.
    """

    # Patterns are compiled once at class creation instead of on every call.
    # Keeps Latin/Cyrillic letters, digits, basic punctuation and whitespace.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _URL = re.compile(r'http\S+|www\S+|https\S+')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Per-instance stop-word set; clean_text() filters against this one.
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text):
        """Return the cleaned version of a single string.

        Args:
            text: The raw comment.

        Returns:
            The cleaned text with tokens separated by single spaces.
        """
        # NOTE: the order of the steps is kept exactly as originally written
        # (character filter first, URL removal second) so the output does not
        # change for existing callers.
        text = self._DISALLOWED_CHARS.sub('', text)
        text = self._URL.sub('', text)
        text = self._WHITESPACE.sub(' ', text)
        # Exact, case-sensitive token match: a capitalized stop word or one
        # with punctuation attached (e.g. "и,") is not removed.
        kept_words = [word for word in text.split() if word not in self.stop_words]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted and ignored so that calls of the form
        ``fit(X, y)`` / ``fit_transform(X, y)`` do not raise ``TypeError``.
        """
        return self

    def transform(self, text):
        """Clean one string or a collection of strings.

        Args:
            text: A single ``str``, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
            ``Series`` input, otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(item) for item in text]
models/sds DELETED
File without changes
notebooks/BERT_toxic.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":5036,"status":"ok","timestamp":1717067779656,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"oiCyNkXhltM1"},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","from sklearn.model_selection import train_test_split\n","\n","from sklearn.model_selection import cross_val_score\n","import torch\n","from torch import nn\n","import matplotlib.pyplot as plt\n","\n","# импортируем трансформеры\n","import transformers\n","import warnings\n","warnings.filterwarnings('ignore')\n","import re"]},{"cell_type":"markdown","metadata":{"id":"Mx4_RYe_N10x"},"source":["#Messages"]},{"cell_type":"code","execution_count":2,"metadata":{"executionInfo":{"elapsed":922,"status":"ok","timestamp":1717067833296,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"hQbBScPWa2CC"},"outputs":[],"source":["df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/aux/labeled(1).csv')"]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":389,"status":"ok","timestamp":1717067837646,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"c7OBdZAXN-bJ","outputId":"5e34663c-50d2-4648-ec7a-1f9667320e38"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"summary":"{\n \"name\": \"df\",\n \"rows\": 14412,\n \"fields\": [\n {\n \"column\": \"comment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14412,\n \"samples\": [\n \"\\u0431\\u0435\\u0437\\u0440\\u043e\\u0434\\u043d\\u044b\\u0439 \\u043f\\u043e\\u0442\\u043e\\u043c\\u043e\\u043a \\u0445\\u043e\\u043b\\u043e\\u043f\\u0430 \\u0440\\u0430\\u0441\\u0441\\u0443\\u0436\\u0434\\u0430\\u0435\\u0442 \\u043e 
\\u043d\\u0430\\u0446\\u0438\\u043e\\u043d\\u0430\\u043b\\u044c\\u043d\\u043e\\u0439 \\u0433\\u043e\\u0440\\u0434\\u043e\\u0441\\u0442\\u0438.\\n\",\n \"\\u0418\\u043d\\u0442\\u0435\\u0440\\u0435\\u0441\\u043d\\u0430\\u044f \\u0442\\u0435\\u043c\\u0430, \\u043e\\u0434\\u043d\\u0430\\u043a\\u043e. \\u041e\\u0422\\u041f \\u0432\\u0440\\u043e\\u0434\\u0435 \\u0432\\u0435\\u043d\\u0433\\u0435\\u0440\\u0441\\u043a\\u0438\\u0439 \\u0431\\u0430\\u043d\\u043a, \\u0432 \\u0412\\u0435\\u043d\\u0433\\u0440\\u0438\\u0438 \\u043e\\u043d \\u0441\\u0430\\u043c\\u044b\\u0439 \\u043f\\u043e\\u043f\\u0443\\u043b\\u044f\\u0440\\u043d\\u044b\\u0439, \\u0443 \\u043c\\u0435\\u043d\\u044f \\u0443 \\u0441\\u0430\\u043c\\u043e\\u0433\\u043e \\u0435\\u0433\\u043e \\u0441\\u0447\\u0451\\u0442 \\u0438 \\u043a\\u0430\\u0440\\u0442\\u0430, \\u0438\\u0431\\u043e \\u0443 \\u043d\\u0435\\u0433\\u043e \\u0434\\u043e\\u0433\\u043e\\u0432\\u043e\\u0440 \\u0441 \\u0443\\u043d\\u0438\\u0432\\u0435\\u0440\\u043e\\u043c, \\u0441\\u043a\\u0438\\u0434\\u043a\\u0438-\\u043f\\u043b\\u044e\\u0448\\u043a\\u0438-\\u0432\\u043e\\u0437\\u0432\\u0440\\u0430\\u0442 \\u0438 \\u0442.\\u043f. \\u0434\\u043b\\u044f \\u0441\\u0442\\u0443\\u0434\\u0435\\u043d\\u0442\\u043e\\u0432. 
\\u0418 \\u0437\\u0430 4 \\u0433\\u043e\\u0434\\u0430 \\u043f\\u043e\\u043b\\u044c\\u0437\\u043e\\u0432\\u0430\\u043d\\u0438\\u044f \\u043d\\u0438 \\u043e\\u0434\\u043d\\u043e\\u0439 \\u043f\\u0440\\u043e\\u0431\\u043b\\u0435\\u043c\\u044b, \\u043d\\u0438 \\u043e\\u0434\\u043d\\u043e\\u0439 \\u043f\\u043e\\u0434\\u043a\\u043b\\u044e\\u0447\\u0451\\u043d\\u043d\\u043e\\u0439 \\u0443\\u0441\\u043b\\u0443\\u0433\\u0438, \\u0431\\u043b\\u043e\\u043a\\u0438\\u0440\\u043e\\u0432\\u0430\\u043d\\u0438\\u044f, \\u0441\\u043f\\u0438\\u0441\\u0430\\u043d\\u0438\\u044f \\u043d\\u0438 \\u0437\\u0430 \\u0447\\u0442\\u043e \\u0438 \\u043f\\u043e\\u0434\\u043e\\u0431\\u043d\\u043e\\u0439 \\u0435\\u0440\\u0435\\u0441\\u0438, \\u043a\\u043e\\u0442\\u043e\\u0440\\u043e\\u0439 \\u0441\\u0442\\u0440\\u0430\\u0434\\u0430\\u044e\\u0442 \\u0432\\u0441\\u0435 \\u0440\\u043e\\u0441\\u0441\\u0438\\u0439\\u0441\\u043a\\u0438\\u0435 \\u0431\\u0430\\u043d\\u043a\\u0438. \\u041d\\u0438\\u043a\\u0430\\u043a\\u0438\\u0445 \\u043a\\u0440\\u0435\\u0434\\u0438\\u0442\\u043d\\u044b\\u0445 \\u043a\\u0430\\u0440\\u0442 \\u0434\\u0430\\u0436\\u0435 \\u043d\\u0435 \\u043f\\u0440\\u0435\\u0434\\u043b\\u0430\\u0433\\u0430\\u044e\\u0442 (\\u0438 \\u0432\\u043e\\u043e\\u0431\\u0449\\u0435 \\u043e \\u043f\\u043e\\u0434\\u043e\\u0431\\u043d\\u044b\\u0445 \\u0438\\u0441\\u0442\\u043e\\u0440\\u0438\\u044f\\u0445 \\u043d\\u0435 \\u0441\\u043b\\u044b\\u0448\\u0430\\u043b). \\u0412\\u044b\\u0445\\u043e\\u0434\\u0438\\u0442, \\u0431\\u0430\\u043d\\u043a \\u0442\\u043e\\u0442 \\u0436\\u0435, \\u0430 \\u043f\\u0440\\u0438\\u043d\\u0446\\u0438\\u043f \\u0440\\u0430\\u0431\\u043e\\u0442\\u044b \\u0434\\u0440\\u0443\\u0433\\u043e\\u0439, \\u0437\\u0430\\u0442\\u043e\\u0447\\u0435\\u043d\\u043d\\u044b\\u0439 \\u043f\\u043e\\u0434 \\u0440\\u043e\\u0441\\u0441\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439 \\u043a\\u043b\\u0438\\u0435\\u043d\\u0442 - \\u043b\\u043e\\u0445 . P.S. 
\\u0412 \\u0412\\u0435\\u043d\\u0433\\u0440\\u0438\\u0438 \\u0432\\u0441\\u0435 \\u0431\\u0430\\u043d\\u043a\\u0438 \\u0448\\u043b\\u044e\\u0442 \\u043a\\u0430\\u0440\\u0442\\u044b \\u043f\\u043e \\u043f\\u043e\\u0447\\u0442\\u0435, \\u043c\\u043e\\u044f \\u043a\\u0430\\u043a \\u0440\\u0430\\u0437 \\u0432 \\u044d\\u0442\\u043e\\u043c \\u043c\\u0435\\u0441\\u044f\\u0446\\u0435 \\u043f\\u0440\\u0438\\u0448\\u043b\\u0430. \\u041d\\u043e \\u0432 \\u043f\\u043e\\u0447\\u0442\\u043e\\u0432\\u044b\\u0439 \\u044f\\u0449\\u0438\\u043a \\u0438\\u0445 \\u043d\\u0438 \\u0437\\u0430 \\u0447\\u0442\\u043e \\u043d\\u0435 \\u043a\\u0438\\u043d\\u0443\\u0442. \\u041b\\u0438\\u0431\\u043e \\u0432 \\u0440\\u0443\\u043a\\u0438, \\u043b\\u0438\\u0431\\u043e \\u0438\\u0437\\u0432\\u0435\\u0449\\u0435\\u043d\\u0438\\u0435 \\u0447\\u0442\\u043e\\u0431 \\u0437\\u0430\\u0431\\u0440\\u0430\\u043b \\u043d\\u0430 \\u043f\\u043e\\u0447\\u0442\\u0435. \\u0410 \\u043f\\u043e\\u0447\\u0442\\u0430 \\u043f\\u043e\\u0440\\u044f\\u0434\\u043e\\u0447\\u043d\\u0430\\u044f\\n\",\n \"\\u0421\\u0443\\u0442\\u044c \\u0442\\u0440\\u0435\\u0431\\u043e\\u0432\\u0430\\u043d\\u0438\\u044f \\u043f\\u0440\\u043e \\u041230 - \\u043d\\u0435 \\u043f\\u0440\\u043e\\u0447\\u043d\\u043e\\u0441\\u0442\\u044c, \\u0430 \\u0432\\u043e\\u0434\\u043e\\u043d\\u0435\\u043f\\u0440\\u043e\\u043d\\u0438\\u0446\\u0430\\u0435\\u043c\\u043e\\u0441\\u0442\\u044c. 
\\u0412\\u043e\\u0434\\u043e\\u043d\\u0435\\u043f\\u0440\\u043e\\u043d\\u0438\\u0446\\u0430\\u0435\\u043c\\u043e\\u0441\\u0442\\u044c \\u0432\\u043e\\u043e\\u0431\\u0449\\u0435 \\u0442\\u043e \\u043e\\u0431\\u043e\\u0437\\u043d\\u0430\\u0447\\u0430\\u0435\\u0442\\u0441\\u044f W, \\u043d\\u0443 \\u0434\\u0430 \\u043d\\u0435 \\u0441\\u0443\\u0442\\u044c, \\u0437\\u0430\\u0447\\u0435\\u043c \\u043c\\u043d\\u0435 \\u043e\\u043d\\u0430 \\u0432 \\u043b\\u0435\\u043d\\u0442\\u0435 \\u043f\\u043e\\u0434 \\u0437\\u0435\\u043c\\u043b\\u0451\\u0439 ?\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"toxic\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47195781877088117,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe","variable_name":"df"},"text/html":["\n"," <div id=\"df-9653c47c-7eb5-4417-b86a-e90940ea3a3a\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>comment</th>\n"," <th>toxic</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Верблюдов-то за что? Дебилы, бл...\\n</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Хохлы, это отдушина затюканого россиянина, мол...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Собаке - собачья смерть\\n</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Страницу обнови, дебил. 
Это тоже не оскорблени...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>тебя не убедил 6-страничный пдф в том, что Скр...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>14407</th>\n"," <td>Вонючий совковый скот прибежал и ноет. А вот и...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14408</th>\n"," <td>А кого любить? Гоблина тупорылого что-ли? Или ...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14409</th>\n"," <td>Посмотрел Утомленных солнцем 2. И оказалось, ч...</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>14410</th>\n"," <td>КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14411</th>\n"," <td>До сих пор пересматриваю его видео. Орамбо кст...</td>\n"," <td>0.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>14412 rows × 2 columns</p>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9653c47c-7eb5-4417-b86a-e90940ea3a3a')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px 
rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-9653c47c-7eb5-4417-b86a-e90940ea3a3a button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-9653c47c-7eb5-4417-b86a-e90940ea3a3a');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n","<div id=\"df-f8c203be-b0b6-4914-8641-c7cadd5ab4f7\">\n"," <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-f8c203be-b0b6-4914-8641-c7cadd5ab4f7')\"\n"," title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color: #1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color: #174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color: #DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color: #3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n"," --hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n"," background-color: var(--bg-color);\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\n"," 
.colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n"," fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin {\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n"," 20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-color);\n"," }\n"," 40% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 60% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\n"," const quickchartButtonEl =\n"," document.querySelector('#' + key + ' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n"," try {\n"," const charts = await google.colab.kernel.invokeFunction(\n"," 'suggestCharts', [key], {});\n"," } catch (error) {\n"," console.error('Error during call to suggestCharts:', error);\n"," }\n"," quickchartButtonEl.classList.remove('colab-df-spinner');\n"," quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n"," 
(() => {\n"," let quickchartButtonEl =\n"," document.querySelector('#df-f8c203be-b0b6-4914-8641-c7cadd5ab4f7 button');\n"," quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n"," </div>\n"],"text/plain":[" comment toxic\n","0 Верблюдов-то за что? Дебилы, бл...\\n 1.0\n","1 Хохлы, это отдушина затюканого россиянина, мол... 1.0\n","2 Собаке - собачья смерть\\n 1.0\n","3 Страницу обнови, дебил. Это тоже не оскорблени... 1.0\n","4 тебя не убедил 6-страничный пдф в том, что Скр... 1.0\n","... ... ...\n","14407 Вонючий совковый скот прибежал и ноет. А вот и... 1.0\n","14408 А кого любить? Гоблина тупорылого что-ли? Или ... 1.0\n","14409 Посмотрел Утомленных солнцем 2. И оказалось, ч... 0.0\n","14410 КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н... 1.0\n","14411 До сих пор пересматриваю его видео. Орамбо кст... 0.0\n","\n","[14412 rows x 2 columns]"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["df"]},{"cell_type":"code","execution_count":15,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1717068388527,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"gaMP-mo0O-a6"},"outputs":[],"source":["import nltk"]},{"cell_type":"code","execution_count":16,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1717068391596,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"8vtOsNYpPh4P"},"outputs":[],"source":["# !pip install nltk"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1717068394415,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"IJVUmClVPglC","outputId":"8a917fc4-c865-41e7-8590-1f9f0fdc2a05"},"outputs":[{"name":"stderr","output_type":"stream","text":["[nltk_data] 
Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Unzipping corpora/stopwords.zip.\n"]},{"data":{"text/plain":["True"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["nltk.download('stopwords')"]},{"cell_type":"code","execution_count":18,"metadata":{"executionInfo":{"elapsed":269,"status":"ok","timestamp":1717068398821,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"ShpWz1gRO7OD"},"outputs":[],"source":["stop_words = set(stopwords.words(\"russian\"))"]},{"cell_type":"code","execution_count":20,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1717068486724,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"Ki_SvBdYOIr5"},"outputs":[],"source":["def clean_text(text):\n"," # Удаление всего, что не является буквами или знаками препинания\n"," clean_pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\\s]')\n"," text = clean_pattern.sub('', text)\n"," url_pattern = re.compile(r'http\\S+|www\\S+|https\\S+')\n"," text = url_pattern.sub(r'', text)\n"," text = re.sub(\"\\s+\", \" \", text)\n"," splitted_text = [word for word in text.split() if word not in stop_words]\n"," text = \" \".join(splitted_text)\n"," return text"]},{"cell_type":"code","execution_count":21,"metadata":{"executionInfo":{"elapsed":835,"status":"ok","timestamp":1717068490177,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"nfXeCbYkOSAn"},"outputs":[],"source":["df['comment'] = df['comment'].apply(clean_text)"]},{"cell_type":"code","execution_count":22,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":319,"status":"ok","timestamp":1717068491955,"user":{"displayName":"вера 
великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"nrtGQtc0OZaF","outputId":"42ba7437-0a23-414f-c1a6-7c717a973a79"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"summary":"{\n \"name\": \"df\",\n \"rows\": 14412,\n \"fields\": [\n {\n \"column\": \"comment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14154,\n \"samples\": [\n \"\\u041a\\u043e\\u0433\\u0434\\u0430 \\u0448\\u0430\\u0440\\u0430\\u0445\\u043d\\u0443\\u043b 6 \\u0433\\u043e\\u0434\\u0443, 4 \\u043c\\u0435\\u0441\\u044f\\u0446\\u0432\\u043e\\u043e\\u0431\\u0449\\u0435 \\u0441\\u0442\\u0430\\u043b\\u043e \\u043e\\u0431\\u044b\\u0447\\u043d\\u044b\\u043c \\u0434\\u0435\\u043b\\u043e\\u043c\",\n \"\\u041b\\u0435\\u0442 7 \\u043d\\u0430\\u0437\\u0430\\u0434 \\u0436\\u0438\\u043b \\u0434\\u043e\\u043c\\u0435 \\u043e\\u0434\\u043d\\u0438\\u043c \\u043f\\u043e\\u0434\\u044a\\u0435\\u0437\\u0434\\u043e\\u043c. \\u0417\\u0432\\u043e\\u043d\\u0438\\u043b \\u0421\\u043a\\u043e\\u0440\\u0443\\u044e \\u0442\\u0440\\u0430\\u0432\\u043c\\u0435 \\u0433\\u043e\\u043b\\u043e\\u0432\\u044b \\u0431\\u043b\\u0438\\u0437\\u043a\\u043e\\u0433\\u043e \\u0447\\u0435\\u043b\\u043e\\u0432\\u0435\\u043a\\u0430 \\u043e\\u043f\\u0435\\u0440\\u0430\\u0442\\u043e\\u0440 \\u043e\\u043f\\u0440\\u043e\\u0441\\u0438\\u0432 \\u043e\\u0431\\u043e \\u0432\\u0441\\u0435\\u043c \\u0434\\u043e\\u0445\\u043e\\u0434\\u0438\\u0442 \\u0432\\u043e\\u043f\\u0440\\u043e\\u0441\\u0430 \\u043d\\u043e\\u043c\\u0435\\u0440 \\u043f\\u043e\\u0434\\u044a\\u0435\\u0437\\u0434\\u0430 , \\u0433\\u043e\\u0432\\u043e\\u0440\\u044e, \\u043f\\u043e\\u0434\\u044a\\u0435\\u0437\\u0434 1 \\u043d\\u043e\\u043c\\u0435\\u0440\\u0430 \\u0432\\u0438\\u0441\\u0435\\u043b\\u043e. \\u0425\\u043e\\u043b\\u043e\\u0434\\u043d\\u044b\\u043c \\u0442\\u043e\\u043d\\u043e\\u043c \\u043f\\u043e\\u0432\\u0442\\u043e\\u0440\\u0438\\u043b\\u0430 \\u0432\\u043e\\u043f\\u0440\\u043e\\u0441. 
\\u042f \\u0431\\u044b\\u0441\\u0442\\u0440\\u043e \\u043f\\u043e\\u043f\\u044b\\u0442\\u0430\\u043b\\u0441\\u044f \\u043f\\u043e\\u0432\\u0442\\u043e\\u0440\\u0438\\u0442\\u044c \\u043e\\u0442\\u0432\\u0435\\u0442, \\u0431\\u043e\\u044f\\u043b\\u0441\\u044f \\u043e\\u0448\\u0438\\u0431\\u0438\\u0442\\u044c\\u0441\\u044f 23 \\u043f\\u043e\\u0434\\u044a\\u0435\\u0437\\u0434\\u043e\\u043c, \\u0441\\u043a\\u043e\\u0440\\u0430\\u044f \\u043d\\u0435\\u043c\\u0443 \\u043f\\u043e\\u0434\\u044a\\u0435\\u0434\\u0435\\u0442 \\u0434\\u0430\\u043b\\u0435\\u043a\\u043e \\u0438\\u0434\\u0442\\u0438 \\u043c\\u043e\\u0435\\u0433\\u043e \\u043d\\u043e\\u043c\\u0435\\u0440\\u0430. 4 \\u043f\\u043e\\u043f\\u044b\\u0442\\u043a\\u0438 \\u0431\\u0440\\u043e\\u0441\\u0438\\u043b \\u0442\\u0440\\u0443\\u0431\\u043a\\u0443, \\u0432\\u044b\\u0432\\u0435\\u043b \\u0442\\u0440\\u0430\\u0432\\u043c\\u0438\\u0440\\u043e\\u0432\\u0430\\u043d\\u043e\\u0433\\u043e \\u0443\\u043b\\u0438\\u0446\\u0443 \\u0441\\u0440\\u0430\\u0437\\u0443 \\u043a\\u0430\\u0440\\u0442\\u0438\\u043d\\u0443 \\u0437\\u0430\\u043c\\u0435\\u0442\\u0438\\u043b \\u0441\\u043e\\u0441\\u0435\\u0434 \\u043c\\u0430\\u0448\\u0438\\u043d\\u043e\\u0439, 10 \\u043c\\u0438\\u043d\\u0443\\u0442 \\u043f\\u0440\\u0438\\u0451\\u043c\\u043d\\u043e\\u043c \\u043e\\u0442\\u0434\\u0435\\u043b\\u0435\\u043d\\u0438\\u0438.\",\n \"\\u041f\\u0440\\u0438\\u0431\\u044b\\u043b\\u044c \\u0431\\u0430\\u043d\\u043a\\u0430 \\u0443\\u043c\\u0435\\u043d\\u044c\\u0448\\u0438\\u0442\\u044c\\u0441\\u044f \\u0441\\u0447\\u0451\\u0442 \\u043d\\u0430\\u0447\\u0438\\u0441\\u043b\\u0435\\u043d\\u044b\\u0445 \\u043f\\u0440\\u043e\\u0446\\u0435\\u043d\\u0442\\u043e\\u0432 30 \\u0434\\u043d\\u0435\\u0439, \\u0441\\u0447\\u0451\\u0442 \\u0434\\u043e\\u0441\\u0442\\u0430\\u0442\\u043e\\u0447\\u043d\\u043e\\u0433\\u043e \\u043a\\u043e\\u043b\\u0438\\u0447\\u0435\\u0441\\u0442\\u0432\\u0430 \\u043f\\u0435\\u0440\\u0441\\u043e\\u043d\\u0430\\u043b\\u0430, 
\\u043e\\u0431\\u0440\\u0430\\u0431\\u0430\\u0442\\u044b\\u0432\\u0430\\u0442\\u044c \\u0437\\u0430\\u044f\\u0432\\u043b\\u0435\\u043d\\u0438\\u044f \\u0431\\u044b\\u0441\\u0442\\u0440\\u0435\\u0435 30 \\u0434\\u043d\\u0435\\u0439. \\u0410 \\u0437\\u0430\\u0451\\u043c\\u0449\\u0438\\u043a \\u043f\\u0435\\u0440\\u0432\\u0443\\u044e \\u043e\\u0447\\u0435\\u0440\\u0435\\u0434\\u044c \\u0434\\u043e\\u043b\\u0436\\u0435\\u043d \\u0434\\u043e\\u0433\\u043e\\u0432\\u043e\\u0440 \\u043f\\u0440\\u043e\\u0447\\u0438\\u0442\\u0430\\u0442\\u044c \\u043f\\u043e\\u043b\\u044c\\u0437\\u043e\\u0432\\u0430\\u0442\\u044c\\u0441\\u044f \\u043f\\u0440\\u043e\\u0434\\u0443\\u043a\\u0442\\u0430\\u043c\\u0438, \\u043a\\u043e\\u0442\\u043e\\u0440\\u044b\\u0435 \\u0443\\u0441\\u0442\\u0440\\u0430\\u0438\\u0432\\u0430\\u044e\\u0442\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"toxic\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47195781877088117,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe","variable_name":"df"},"text/html":["\n"," <div id=\"df-aedf5b20-db32-4ee5-9245-04b80f05e2ee\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>comment</th>\n"," <th>toxic</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Верблюдовто что? 
Дебилы, бл...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Хохлы, это отдушина затюканого россиянина, мол...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Собаке собачья смерть</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Страницу обнови, дебил. Это оскорбление, доказ...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>убедил 6страничный пдф том, Скрипалей отравила...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>14407</th>\n"," <td>Вонючий совковый скот прибежал ноет. А сторонн...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14408</th>\n"," <td>А кого любить? Гоблина тупорылого чтоли? Или к...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14409</th>\n"," <td>Посмотрел Утомленных солнцем 2. И оказалось, э...</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>14410</th>\n"," <td>КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>14411</th>\n"," <td>До сих пор пересматриваю видео. 
Орамбо кстати ...</td>\n"," <td>0.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>14412 rows × 2 columns</p>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-aedf5b20-db32-4ee5-9245-04b80f05e2ee')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-aedf5b20-db32-4ee5-9245-04b80f05e2ee button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-aedf5b20-db32-4ee5-9245-04b80f05e2ee');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n","<div id=\"df-e9f11f07-c423-416f-b45a-4bfcd208a9e6\">\n"," <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-e9f11f07-c423-416f-b45a-4bfcd208a9e6')\"\n"," title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color: #1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color: #174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color: #DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color: #3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n"," --hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n"," background-color: var(--bg-color);\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: 
none;\n"," fill: var(--fill-color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n"," fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin {\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n"," 20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-color);\n"," }\n"," 40% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 60% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\n"," const quickchartButtonEl =\n"," document.querySelector('#' + key + ' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n"," try {\n"," 
const charts = await google.colab.kernel.invokeFunction(\n"," 'suggestCharts', [key], {});\n"," } catch (error) {\n"," console.error('Error during call to suggestCharts:', error);\n"," }\n"," quickchartButtonEl.classList.remove('colab-df-spinner');\n"," quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n"," (() => {\n"," let quickchartButtonEl =\n"," document.querySelector('#df-e9f11f07-c423-416f-b45a-4bfcd208a9e6 button');\n"," quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n"," </div>\n"],"text/plain":[" comment toxic\n","0 Верблюдовто что? Дебилы, бл... 1.0\n","1 Хохлы, это отдушина затюканого россиянина, мол... 1.0\n","2 Собаке собачья смерть 1.0\n","3 Страницу обнови, дебил. Это оскорбление, доказ... 1.0\n","4 убедил 6страничный пдф том, Скрипалей отравила... 1.0\n","... ... ...\n","14407 Вонючий совковый скот прибежал ноет. А сторонн... 1.0\n","14408 А кого любить? Гоблина тупорылого чтоли? Или к... 1.0\n","14409 Посмотрел Утомленных солнцем 2. И оказалось, э... 0.0\n","14410 КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н... 1.0\n","14411 До сих пор пересматриваю видео. Орамбо кстати ... 
0.0\n","\n","[14412 rows x 2 columns]"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["df"]},{"cell_type":"code","execution_count":45,"metadata":{"executionInfo":{"elapsed":663,"status":"ok","timestamp":1717069872952,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"yeEiYdqWRz9M"},"outputs":[],"source":["import torch\n","from transformers import AutoTokenizer, AutoModelForSequenceClassification\n","\n","model_checkpoint = 'cointegrated/rubert-tiny-toxicity'\n","tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n","model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n","if torch.cuda.is_available():\n"," model.cuda()"]},{"cell_type":"code","execution_count":46,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":291,"status":"ok","timestamp":1717069879631,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"SymPcx7vVrps","outputId":"6a1b8850-e210-4607-d485-cc28f1acf7df"},"outputs":[{"data":{"text/plain":["BertForSequenceClassification(\n"," (bert): BertModel(\n"," (embeddings): BertEmbeddings(\n"," (word_embeddings): Embedding(29564, 312, padding_idx=0)\n"," (position_embeddings): Embedding(512, 312)\n"," (token_type_embeddings): Embedding(2, 312)\n"," (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," (encoder): BertEncoder(\n"," (layer): ModuleList(\n"," (0-2): 3 x BertLayer(\n"," (attention): BertAttention(\n"," (self): BertSdpaSelfAttention(\n"," (query): Linear(in_features=312, out_features=312, bias=True)\n"," (key): Linear(in_features=312, out_features=312, bias=True)\n"," (value): Linear(in_features=312, out_features=312, bias=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," (output): BertSelfOutput(\n"," (dense): Linear(in_features=312, out_features=312, bias=True)\n"," (LayerNorm): 
LayerNorm((312,), eps=1e-12, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," )\n"," (intermediate): BertIntermediate(\n"," (dense): Linear(in_features=312, out_features=600, bias=True)\n"," (intermediate_act_fn): GELUActivation()\n"," )\n"," (output): BertOutput(\n"," (dense): Linear(in_features=600, out_features=312, bias=True)\n"," (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," )\n"," )\n"," )\n"," (pooler): BertPooler(\n"," (dense): Linear(in_features=312, out_features=312, bias=True)\n"," (activation): Tanh()\n"," )\n"," )\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," (classifier): Linear(in_features=312, out_features=5, bias=True)\n",")"]},"execution_count":46,"metadata":{},"output_type":"execute_result"}],"source":["model"]},{"cell_type":"code","execution_count":24,"metadata":{"executionInfo":{"elapsed":276,"status":"ok","timestamp":1717069000589,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"6c4sA0oQSah5"},"outputs":[],"source":["from sklearn.model_selection import train_test_split"]},{"cell_type":"code","execution_count":25,"metadata":{"executionInfo":{"elapsed":244,"status":"ok","timestamp":1717069049509,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"IYbB9fpKSPN4"},"outputs":[],"source":["X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['toxic'], test_size=0.2, random_state=42)"]},{"cell_type":"code","execution_count":27,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1717069113528,"user":{"displayName":"вера 
великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"Z2hdPUhrSyVR","outputId":"0d7664db-3df0-467b-cd99-13631d53b265"},"outputs":[{"data":{"text/plain":["160.0"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["lengths = [len(review) for review in df['comment']]\n","\n","# Шаг 3: Вычислить 75-й квантиль длины отзывов\n","quantile_75 = np.percentile(lengths, 75)\n","quantile_75"]},{"cell_type":"code","execution_count":28,"metadata":{"executionInfo":{"elapsed":241,"status":"ok","timestamp":1717069120517,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"Z0Z0HuTHSpu9"},"outputs":[],"source":["MAX_LEN = 100"]},{"cell_type":"code","execution_count":29,"metadata":{"executionInfo":{"elapsed":6393,"status":"ok","timestamp":1717069128723,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"D1kEbsZsSO_4"},"outputs":[],"source":["tokenized_train = X_train.apply((lambda x: tokenizer.encode(x,\n"," add_special_tokens=True,\n"," truncation=True,\n"," padding='max_length',\n"," max_length=MAX_LEN)))"]},{"cell_type":"code","execution_count":30,"metadata":{"executionInfo":{"elapsed":1408,"status":"ok","timestamp":1717069139904,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"w9fDzcLTSo-0"},"outputs":[],"source":["tokenized_valid = X_test.apply((lambda x: tokenizer.encode(x,\n"," add_special_tokens=True,\n"," truncation=True,\n"," padding='max_length',\n"," max_length=MAX_LEN)))"]},{"cell_type":"code","execution_count":31,"metadata":{"executionInfo":{"elapsed":280,"status":"ok","timestamp":1717069158349,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"y0JTa8XgS8XT"},"outputs":[],"source":["attention_mask_train = np.where(np.array(list(tokenized_train.values)) != 0, 1, 
0)"]},{"cell_type":"code","execution_count":32,"metadata":{"executionInfo":{"elapsed":456,"status":"ok","timestamp":1717069173165,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"W04NPnoqTBR8"},"outputs":[],"source":["attention_mask_valid = np.where(np.array(list(tokenized_valid.values)) != 0, 1, 0)"]},{"cell_type":"code","execution_count":33,"metadata":{"executionInfo":{"elapsed":263,"status":"ok","timestamp":1717069622940,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"lbzBRiSlTUTy"},"outputs":[],"source":["from torch.utils.data import Dataset"]},{"cell_type":"code","execution_count":34,"metadata":{"executionInfo":{"elapsed":261,"status":"ok","timestamp":1717069625066,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"37ZKdpt2TUTy"},"outputs":[],"source":["class BertInputs(torch.utils.data.Dataset):\n"," def __init__(self, tokenized_inputs, attention_masks, targets):\n"," self.tokenized_inputs = tokenized_inputs\n"," self.attention_masks = attention_masks\n"," self.targets = targets\n","\n"," def __len__(self):\n"," return self.tokenized_inputs.shape[0]\n","\n"," def __getitem__(self, idx):\n"," ids = self.tokenized_inputs[idx]\n"," ams = self.attention_masks[idx]\n"," target = self.targets[idx]\n","\n"," return ids, ams, target\n"]},{"cell_type":"code","execution_count":35,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1717069626758,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"qSxgGiCUTUTy"},"outputs":[],"source":["train_tokens = np.array(list(tokenized_train.values))"]},{"cell_type":"code","execution_count":36,"metadata":{"executionInfo":{"elapsed":268,"status":"ok","timestamp":1717069635938,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"RhxDZ1ouTUTz"},"outputs":[],"source":["valid_tokens = 
np.array(list(tokenized_valid.values))"]},{"cell_type":"code","execution_count":41,"metadata":{"executionInfo":{"elapsed":4,"status":"ok","timestamp":1717069778832,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"p5SzOedyVJum"},"outputs":[],"source":["target_train = y_train.to_numpy()\n","target_valid = y_test.to_numpy()"]},{"cell_type":"code","execution_count":39,"metadata":{"executionInfo":{"elapsed":344,"status":"ok","timestamp":1717069760952,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"MDvj3KOKTUTz"},"outputs":[],"source":["train_dataset = BertInputs(torch.from_numpy(train_tokens), attention_mask_train, torch.from_numpy(target_train))"]},{"cell_type":"code","execution_count":42,"metadata":{"executionInfo":{"elapsed":329,"status":"ok","timestamp":1717069780569,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"ocyD7tDRTUTz"},"outputs":[],"source":["valid_dataset = BertInputs(torch.from_numpy(valid_tokens), attention_mask_valid, torch.from_numpy(target_valid))"]},{"cell_type":"code","execution_count":43,"metadata":{"executionInfo":{"elapsed":308,"status":"ok","timestamp":1717069788867,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"8HCB_1AoTUTz"},"outputs":[],"source":["train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=False)\n","valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=False)"]},{"cell_type":"code","execution_count":107,"metadata":{"executionInfo":{"elapsed":292,"status":"ok","timestamp":1717072670743,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"wKpry0axTUTz"},"outputs":[],"source":["class BERTClassifier(nn.Module):\n"," def __init__(self):\n"," super().__init__()\n"," self.bert = 
AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-tiny-toxicity')\n"," self.bert.classifier = nn.Linear(312, 312)\n"," for param in self.bert.parameters():\n"," param.requires_grad = False\n"," self.linear = nn.Sequential(\n"," nn.Linear(312, 128),\n"," nn.Sigmoid(),\n"," nn.Dropout(),\n"," nn.Linear(128, 1)\n"," )\n","\n"," def forward(self, x, attention_mask=None):\n"," bert_out = self.bert(x, attention_mask=attention_mask).logits\n"," out = self.linear(bert_out).squeeze(1)\n"," return out"]},{"cell_type":"code","execution_count":108,"metadata":{"executionInfo":{"elapsed":6,"status":"ok","timestamp":1717072670744,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"VsusHb9ZZ9FN"},"outputs":[],"source":["# !pip install torchmetrics"]},{"cell_type":"code","execution_count":109,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1717072671174,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"ZWve7ng-Z6xP"},"outputs":[],"source":["from torchmetrics import Accuracy\n","from torchmetrics.classification import BinaryF1Score"]},{"cell_type":"code","execution_count":110,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1717072671456,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"CK9G-kRzZ6xQ"},"outputs":[],"source":["model = BERTClassifier()\n","device = 'cuda'\n","criterion = nn.BCEWithLogitsLoss()\n","optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)\n","metric = BinaryF1Score().to(device)\n","model.to(device);"]},{"cell_type":"code","execution_count":111,"metadata":{"executionInfo":{"elapsed":4,"status":"ok","timestamp":1717072671705,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"QqCSRkOKZsgI"},"outputs":[],"source":["def train_attention_lstm(epochs, model, train_loader, valid_loader, optimizer,\n"," criterion, metric, 
lstm_conf=None):\n"," epoch_train_losses = []\n"," epoch_valid_losses = []\n"," epoch_train_metric = []\n"," epoch_valid_metric = []\n"," device = 'cuda'\n"," for epoch in range(epochs):\n"," batch_losses = []\n"," batch_metric = []\n"," model.train()\n"," model.to(device)\n"," for inputs, attention_masks, labels in train_loader:\n"," inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)\n","\n"," output = model(inputs, attention_mask=attention_masks)\n"," loss = criterion(output, labels)\n"," optimizer.zero_grad()\n"," loss.backward()\n"," optimizer.step()\n","\n"," batch_losses.append(loss.item())\n"," batch_metric.append(metric(output, labels).item())\n","\n"," epoch_train_losses.append(np.mean(batch_losses))\n"," epoch_train_metric.append(np.mean(batch_metric))\n","\n"," batch_losses = []\n"," batch_metric = []\n"," model.eval()\n"," for inputs, attention_masks, labels in valid_loader:\n"," inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)\n"," with torch.no_grad():\n"," output = model(inputs, attention_mask=attention_masks)\n"," loss = criterion(output, labels)\n"," batch_losses.append(loss.item())\n"," batch_metric.append(metric(output, labels).item())\n"," epoch_valid_losses.append(np.mean(batch_losses))\n"," epoch_valid_metric.append(np.mean(batch_metric))\n","\n"," print(f'Epoch {epoch+1}')\n"," print(f'Train loss: {epoch_train_losses[-1]:.4f}, Val loss {epoch_valid_losses[-1]:.4f}')\n"," print(f'Train accuracy: {epoch_train_metric[-1]:.2f}, Val accuracy: {epoch_valid_metric[-1]:.2f}')\n"," print(25*'==')\n","\n"," return (epoch_train_losses, epoch_valid_losses, epoch_train_metric, epoch_valid_metric)"]},{"cell_type":"code","execution_count":112,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":86525,"status":"ok","timestamp":1717072758525,"user":{"displayName":"вера 
великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"A1wQkEHTZxw4","outputId":"ca0403be-64c5-4864-a2f9-636b77991150"},"outputs":[{"name":"stdout","output_type":"stream","text":["Epoch 1\n","Train loss: 0.5084, Val loss 0.4024\n","Train accuracy: 0.57, Val accuracy: 0.70\n","==================================================\n","Epoch 2\n","Train loss: 0.4391, Val loss 0.3867\n","Train accuracy: 0.66, Val accuracy: 0.72\n","==================================================\n","Epoch 3\n","Train loss: 0.4279, Val loss 0.3793\n","Train accuracy: 0.68, Val accuracy: 0.73\n","==================================================\n","Epoch 4\n","Train loss: 0.4193, Val loss 0.3731\n","Train accuracy: 0.69, Val accuracy: 0.73\n","==================================================\n","Epoch 5\n","Train loss: 0.4131, Val loss 0.3679\n","Train accuracy: 0.70, Val accuracy: 0.74\n","==================================================\n","Epoch 6\n","Train loss: 0.4070, Val loss 0.3634\n","Train accuracy: 0.70, Val accuracy: 0.74\n","==================================================\n","Epoch 7\n","Train loss: 0.4082, Val loss 0.3593\n","Train accuracy: 0.70, Val accuracy: 0.75\n","==================================================\n","Epoch 8\n","Train loss: 0.4026, Val loss 0.3563\n","Train accuracy: 0.71, Val accuracy: 0.75\n","==================================================\n","Epoch 9\n","Train loss: 0.4043, Val loss 0.3537\n","Train accuracy: 0.71, Val accuracy: 0.76\n","==================================================\n","Epoch 10\n","Train loss: 0.4014, Val loss 0.3518\n","Train accuracy: 0.72, Val accuracy: 0.76\n","==================================================\n","Epoch 11\n","Train loss: 0.3974, Val loss 0.3497\n","Train accuracy: 0.72, Val accuracy: 0.76\n","==================================================\n","Epoch 12\n","Train loss: 0.3948, Val loss 0.3478\n","Train accuracy: 0.72, Val accuracy: 
0.76\n","==================================================\n","Epoch 13\n","Train loss: 0.3952, Val loss 0.3467\n","Train accuracy: 0.72, Val accuracy: 0.76\n","==================================================\n","Epoch 14\n","Train loss: 0.3946, Val loss 0.3454\n","Train accuracy: 0.72, Val accuracy: 0.76\n","==================================================\n","Epoch 15\n","Train loss: 0.3930, Val loss 0.3444\n","Train accuracy: 0.72, Val accuracy: 0.77\n","==================================================\n","Epoch 16\n","Train loss: 0.3902, Val loss 0.3433\n","Train accuracy: 0.72, Val accuracy: 0.76\n","==================================================\n","Epoch 17\n","Train loss: 0.3901, Val loss 0.3424\n","Train accuracy: 0.72, Val accuracy: 0.77\n","==================================================\n","Epoch 18\n","Train loss: 0.3899, Val loss 0.3415\n","Train accuracy: 0.72, Val accuracy: 0.77\n","==================================================\n","Epoch 19\n","Train loss: 0.3905, Val loss 0.3409\n","Train accuracy: 0.72, Val accuracy: 0.77\n","==================================================\n","Epoch 20\n","Train loss: 0.3904, Val loss 0.3403\n","Train accuracy: 0.72, Val accuracy: 0.77\n","==================================================\n"]},{"data":{"text/plain":["([0.5083780060451313,\n"," 0.4391106702167688,\n"," 0.42793449899041985,\n"," 0.4192526624897032,\n"," 0.4130774913023195,\n"," 0.40695059533327993,\n"," 0.40816243859394574,\n"," 0.4025705500237704,\n"," 0.40428590983994134,\n"," 0.4014262039793654,\n"," 0.3973992633044861,\n"," 0.39483769866059215,\n"," 0.3951748218383066,\n"," 0.39460813710124554,\n"," 0.3929524585412899,\n"," 0.3901995505507913,\n"," 0.39005828239460105,\n"," 0.3898510054791003,\n"," 0.39051921931187866,\n"," 0.3903515106272374],\n"," [0.402441298047971,\n"," 0.3867063879904477,\n"," 0.3793247446857884,\n"," 0.3730928883862242,\n"," 0.3679111393862797,\n"," 0.3633536281389945,\n"," 0.3593491589004045,\n"," 
0.3562914771151528,\n"," 0.3536964474871052,\n"," 0.35182306410159725,\n"," 0.34968449011011793,\n"," 0.3477907560270232,\n"," 0.34674735903661463,\n"," 0.3454434093392382,\n"," 0.34439929916767315,\n"," 0.34329559643294033,\n"," 0.34237983056187166,\n"," 0.3415207479647173,\n"," 0.3408517934908176,\n"," 0.3403323164568777],\n"," [0.5658733633829286,\n"," 0.6621058487908662,\n"," 0.6838039802464752,\n"," 0.6886554599386173,\n"," 0.6963648699492299,\n"," 0.6996440213803109,\n"," 0.7021754302311471,\n"," 0.7119714741148777,\n"," 0.710395372268896,\n"," 0.7155060225741685,\n"," 0.7166487282332952,\n"," 0.7205212973986966,\n"," 0.7195848430457868,\n"," 0.7156672836010476,\n"," 0.7237848192866159,\n"," 0.7244235358905264,\n"," 0.7235892567773274,\n"," 0.7228518685640721,\n"," 0.7241709839521683,\n"," 0.7238757334587647],\n"," [0.7004337032417675,\n"," 0.71641305907742,\n"," 0.728126830124593,\n"," 0.7323439026271904,\n"," 0.739565012219188,\n"," 0.7440549931028387,\n"," 0.7477671592445164,\n"," 0.7535042890480587,\n"," 0.7572787361485618,\n"," 0.7609272281547169,\n"," 0.7609272281547169,\n"," 0.7626037581281347,\n"," 0.7633639616625649,\n"," 0.7642797123599838,\n"," 0.7651110563304399,\n"," 0.7646179228693575,\n"," 0.7660046583348579,\n"," 0.768307666529666,\n"," 0.770193106853045,\n"," 0.7677900676543896])"]},"execution_count":112,"metadata":{},"output_type":"execute_result"}],"source":["train_attention_lstm(20, model, train_loader, valid_loader, optimizer, criterion, metric)"]},{"cell_type":"code","execution_count":113,"metadata":{"executionInfo":{"elapsed":282,"status":"ok","timestamp":1717073592487,"user":{"displayName":"вера великоборец","userId":"17606763383908550373"},"user_tz":-180},"id":"oXNFCJ4wbhUl"},"outputs":[],"source":["# Сохранение весов модели\n","torch.save(model.state_dict(), 
'model_weights.pth')\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"BWYh3Sppj70S"},"outputs":[],"source":[]}],"metadata":{"accelerator":"GPU","colab":{"authorship_tag":"ABX9TyPBcq+impWauwHnDO3K9VQh","gpuType":"T4","mount_file_id":"1usIDPB7YVnxZIo3V3ggC49u8SFxGFn3M","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
pages/comments.py CHANGED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import sys
4
+ from pathlib import Path
5
+ import requests
6
+ import time
7
+ import cv2
8
+ import numpy as np
9
+ from transformers import AutoTokenizer
10
+
11
+
12
st.write("# Оценка степени токсичности пользовательского сообщения")

# Make the project's packages importable from this page.
# `models.model2...` is resolved relative to the project root (this file lives
# in <root>/pages/), so the root itself has to be on sys.path; the `models`
# directory alone is not enough for a `models.*` import.
project_root = Path(__file__).resolve().parents[1]
models_path = project_root / 'models'
# Streamlit re-executes the script on every interaction, so only add entries
# that are not already present instead of appending duplicates on each rerun.
for _path in (str(project_root), str(models_path)):
    if _path not in sys.path:
        sys.path.append(_path)
from models.model2.preprocess_text import TextPreprocessorBERT
from models.model2.model import BERTClassifier

# Device used both for loading the weights and for inference.
device = 'cpu'
23
+
24
+ # Загрузка модели и словаря
25
@st.cache_resource
def load_model():
    """Build the toxicity classifier and restore its saved weights.

    Wrapped in ``st.cache_resource`` so that Streamlit can reuse the loaded
    model across script reruns.

    Returns:
        A ``BERTClassifier`` with the state dict from
        ``models/model2/model_weights.pth`` loaded, moved to ``device`` and
        switched to evaluation mode.
    """
    checkpoint = models_path / 'model2' / 'model_weights.pth'
    classifier = BERTClassifier()
    # map_location remaps the stored tensors onto the configured device.
    classifier.load_state_dict(torch.load(checkpoint, map_location=device))
    classifier.to(device)
    classifier.eval()
    return classifier
34
+
35
@st.cache_resource
def load_tokenizer():
    """Return the tokenizer for the 'cointegrated/rubert-tiny-toxicity' checkpoint.

    Wrapped in ``st.cache_resource`` so that Streamlit can reuse the tokenizer
    across script reruns.
    """
    checkpoint_name = 'cointegrated/rubert-tiny-toxicity'
    loaded = AutoTokenizer.from_pretrained(checkpoint_name)
    return loaded
38
+
39
model = load_model()
tokenizer = load_tokenizer()

# Length every input is padded/truncated to; the training notebook
# (notebooks/BERT_toxic.ipynb) tokenizes with MAX_LEN = 100 as well.
MAX_LEN = 100

input_text = st.text_area('Введите текст сообщения')

if st.button('Предсказать'):
    if not input_text.strip():
        # A blank message would be padded to MAX_LEN and still scored,
        # producing a number that means nothing; ask for input instead.
        st.warning('Введите текст сообщения')
    else:
        # Clean the message with the project's text preprocessor.
        preprocessor = TextPreprocessorBERT()
        preprocessed_text = preprocessor.transform(input_text)

        # Tokenize. Calling the tokenizer directly replaces the deprecated
        # `encode_plus` and returns the same `input_ids` / `attention_mask`.
        tokens = tokenizer(
            preprocessed_text,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_tensors='pt',
        )

        input_ids = tokens['input_ids'].to(device)
        attention_mask = tokens['attention_mask'].to(device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)

        # The sigmoid of the model output is a probability in [0, 1], not a
        # class label, so the message names it accordingly.
        prediction = torch.sigmoid(output).item()
        st.write(f'Вероятность токсичности: {prediction:.4f}')
70
+