Анастасия committed
Commit · 582663d
1 Parent(s): e0a865c
project_streamlit_app
Browse files
- .DS_Store +0 -0
- .gitattributes +4 -0
- __init__.py +0 -0
- app.py +24 -0
- cosine_similarity_model.ipynb +137 -0
- data/.DS_Store +0 -0
- data/data.csv +3 -0
- data/embs.txt +3 -0
- images/.DS_Store +0 -0
- images/ser2.png +3 -0
- model/config.json +3 -0
- model/model.safetensors +3 -0
- pages/.DS_Store +0 -0
- pages/01_🎥_Serials.py +105 -0
- pages/02_🔥_Results.py +15 -0
- pages/__init__.py +0 -0
- pages/__pycache__/__init__.cpython-310.pyc +0 -0
- parsing.ipynb +448 -0
- requirements.txt +3 -0
- tokenizer/special_tokens_map.json +3 -0
- tokenizer/tokenizer_config.json +3 -0
- tokenizer/vocab.txt +3 -0
.DS_Store
ADDED
Binary file (8.2 kB).
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,24 @@
+import streamlit as st
+
+"""
+## Сервис умного поиска сериалов 📽️
+#### Проект по рекомендательным системам.
+"""
+
+st.image('images/ser2.png')
+
+st.write("""
+#### Состав команды:
+\n1. ##### [Анастасия](https://github.com/AnastasiaMozhayskaya) 🧝‍♀️
+\n2. ##### [Роман](https://github.com/r-makushkin) 🦸‍♂️
+\n3. ##### [Алексей](https://github.com/WeinsGH) 🦹‍♂️
+""", unsafe_allow_html=True)
+
+"""
+#### Задачи:
+\n- ###### Собрать выборку из не менее, чем 5000 сериалов
+\n- ###### Разработать систему поиска сериала по пользовательскому запросу.
+\n- ###### Описания сериалов на русском языке
+\n- ###### Сервис должен принимать на вход описание сериала от \
+пользователя и возвращать заданное количество подходящих вариантов.
+"""
cosine_similarity_model.ipynb
ADDED
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/Anastasia/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import BertTokenizer, BertModel\n",
+    "import torch\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "import numpy as np\n",
+    "import time\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(array([ 5517,  9066, 13361, 11717,   320, 10793, 14201,  9305,  9199,\n",
+      "        8294]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))\n",
+      "3.533276081085205\n"
+     ]
+    }
+   ],
+   "source": [
+    "start_time = time.time()\n",
+    "\n",
+    "\n",
+    "# Read the series embeddings\n",
+    "embeddings = np.loadtxt('data/embs.txt')\n",
+    "# Paths to the saved model and tokenizer\n",
+    "model_path = \"model\"\n",
+    "tokenizer_path = \"tokenizer\"\n",
+    "\n",
+    "# Load the model\n",
+    "loaded_model = BertModel.from_pretrained(model_path)\n",
+    "\n",
+    "# Load the tokenizer\n",
+    "loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)\n",
+    "\n",
+    "\n",
+    "# Vectorize the query\n",
+    "loaded_model.eval()\n",
+    "tokens = loaded_tokenizer('петух закукарекал', return_tensors=\"pt\", padding=True, truncation=True)\n",
+    "\n",
+    "# Move the tokens to the same device as the model\n",
+    "tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}\n",
+    "\n",
+    "# Pass the tokens through the model to get embeddings\n",
+    "with torch.no_grad():\n",
+    "    output = loaded_model(**tokens)\n",
+    "\n",
+    "# The embedding is the mean of the last hidden state\n",
+    "user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()\n",
+    "\n",
+    "\n",
+    "cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))\n",
+    "\n",
+    "# Get the indices of the 10 best-matching rows in the numpy array\n",
+    "top_10_indices = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-10:], cosine_similarities.shape)\n",
+    "print(top_10_indices)\n",
+    "end_time = time.time()\n",
+    "print(end_time-start_time)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[5517, 9066, 13361, 11717, 320, 10793, 14201, 9305, 9199, 8294]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(top_10_indices[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".elbrus2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
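A note on the retrieval cell above: np.argsort sorts ascending, so the [-10:] tail lists the matches worst-to-best, and np.unravel_index is unnecessary for an (N, 1) similarity matrix. Below is a minimal sketch of the same lookup factored into a reusable, best-first helper; the max_length=512 cap is an assumption added to address the truncation warning in the cell's stderr.

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity


def top_k_series(query, tokenizer, model, embeddings, k=10):
    """Return indices of the k series closest to the query, best match first."""
    tokens = tokenizer(query, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    tokens = {key: value.to(model.device) for key, value in tokens.items()}
    with torch.no_grad():
        output = model(**tokens)
    # Mean-pool the last hidden state into a single query vector
    query_emb = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    sims = cosine_similarity(embeddings, query_emb.reshape(1, -1)).ravel()
    # argsort is ascending: take the k largest, then reverse for best-first
    return np.argsort(sims)[-k:][::-1]

With the objects loaded in the cell above, top_k_series('петух закукарекал', loaded_tokenizer, loaded_model, embeddings) would reproduce the printed indices, reversed into best-first order (the max_length cap does not bite on such a short query).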
data/.DS_Store
ADDED
Binary file (6.15 kB).
data/data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
+size 19266108
data/embs.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:738ec3d829fa4fe69441898cc7f16d3db560af029831428504c7326a5f4de3cf
+size 292731853
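The pointer above shows data/embs.txt is roughly 292 MB of text, and both the notebook and the Serials page re-parse it with np.loadtxt at startup. A possible speed-up, sketched under the assumption that the text file stays the source of truth (data/embs.npy is a hypothetical new path, not part of the commit):

import numpy as np

# One-off conversion: parsing ~292 MB of text with np.loadtxt is slow,
# while np.load on a binary .npy file is close to a straight memory read.
embeddings = np.loadtxt('data/embs.txt')
np.save('data/embs.npy', embeddings)

# At app startup, load the binary copy instead:
embeddings = np.load('data/embs.npy')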
images/.DS_Store
ADDED
Binary file (6.15 kB).
images/ser2.png
ADDED
Binary file (stored in Git LFS).
model/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47128322e633a57168f608258f7409ee2890186641e008059bcf1ba3010f3a61
+size 829
model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5726147049874a1efbf83c95cc45f74e24267f501998cdcecea06b474e9d16f
+size 711436136
pages/.DS_Store
ADDED
Binary file (6.15 kB).
pages/01_🎥_Serials.py
ADDED
@@ -0,0 +1,105 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import ast
+import random
+import torch
+import time
+from joblib import load
+
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics.pairwise import cosine_similarity
+# import faiss
+"""
+## Сервис умного поиска сериалов 📽️
+"""
+
+# Read the series embeddings
+embeddings = np.loadtxt('data/embs.txt')
+# Paths to the saved model and tokenizer
+model_path = "model"
+tokenizer_path = "tokenizer"
+# Load the model
+loaded_model = BertModel.from_pretrained(model_path)
+# Load the tokenizer
+loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+
+df = pd.read_csv('data/data.csv')
+df['ganres'] = df['ganres'].apply(lambda x: ast.literal_eval(x))
+df['description'] = df['description'].astype(str)
+
+st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Наш сервис насчитывает \
+{len(df)} лучших сериалов</p>', unsafe_allow_html=True)
+
+st.image('images/ser2.png')
+
+ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
+                     'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
+                     'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])
+
+st.sidebar.header('Панель инструментов :gear:')
+choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
+n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15, 20, 30])
+st.sidebar.info("Для наилучшего соответствия, запрос должен быть максимально развернутым")
+
+text = st.text_input('Введите описание для рекомендации')
+
+# Vectorize the query
+loaded_model.eval()
+tokens = loaded_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+start_time = time.time()
+tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}
+
+# Pass the tokens through the model to get embeddings
+with torch.no_grad():
+    output = loaded_model(**tokens)
+
+# The embedding is the mean of the last hidden state
+user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()
+cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))
+
+button = st.button('Отправить запрос', type="primary")
+
+if text and button:
+
+    if len(choice_g) == 0:
+        choice_g = ganres_lst
+    # random = random.sample(range(len(df)), 50)
+    top_ind = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-30:][::-1], cosine_similarities.shape)
+    confidence = cosine_similarities[top_ind]
+    top_ind = list(top_ind[0])
+    conf_dict = {}
+    for value, conf in zip(top_ind, confidence):
+        conf_dict[int(value)] = conf
+    # st.write(conf_dict)
+    output_dict = {}
+    for i in top_ind:
+        for ganre in df['ganres'][i]:
+            if ganre in choice_g:
+                output_dict[i] = df['ganres'][i]
+    # st.write('output_dict')
+    sorted_lst = sorted(output_dict.items(), key=lambda x: len(set(x[1]) & set(choice_g)), reverse=True)
+    n_lst = [i[0] for i in sorted_lst[:n]]
+    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
+рекомендаций {len(sorted_lst)}</strong></p>', unsafe_allow_html=True)
+    st.write('\n')
+
+    # Display the posters and titles
+    for i in n_lst:
+        col1, col2 = st.columns([3, 4])
+        with col1:
+            st.image(df['poster'][i], width=300)
+        with col2:
+            st.write(f"***Название:*** {df['title'][i]}")
+            st.write(f"***Жанр:*** {', '.join(df['ganres'][i])}")
+            st.write(f"***Описание:*** {df['description'][i]}")
+            # similarity = float(confidence)
+            # st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
+            st.write(f"***Ссылка на сериал: {df['url'][i]}***")
+            st.write(f"")
+        end_time = time.time()
+        st.write(f"<small>*Степень соответствия по косинусному сходству: {conf_dict[i]:.4f}*</small>", unsafe_allow_html=True)
+        st.markdown(
+            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+            unsafe_allow_html=True
+        )
pages/02_🔥_Results.py
ADDED
@@ -0,0 +1,15 @@
+import streamlit as st
+from PIL import Image
+
+st.write("""
+## 📝 Итоги проекта Рекомендательные системы.
+""")
+"""
+###### 1. Парсинг профильных сайтов.
+###### 2. Сбор и анализ информации с киносервисов. Формирование датасета. Итоговый размер - 14939 объектов.
+###### 3. Предобработка данных от лишних символов и пропусков.
+###### 4. Векторизация с использованием модели rubert-tiny2.
+"""
+
+
+
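Point 4 of the summary is the step that produced data/embs.txt. The commit ships only the resulting model, tokenizer, and embedding files, so the following is an illustrative sketch of how that vectorization might have looked; the Hub ID cointegrated/rubert-tiny2, the batch size, the max_length cap, and mean pooling over the last hidden state are assumptions (mean pooling matches what the app does at query time):

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
model.eval()


def embed_descriptions(texts, batch_size=64):
    """Mean-pooled rubert-tiny2 embeddings for a list of series descriptions."""
    chunks = []
    for i in range(0, len(texts), batch_size):
        tokens = tokenizer(texts[i:i + batch_size], return_tensors="pt",
                           padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            out = model(**tokens)
        chunks.append(out.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.concatenate(chunks)


# embs = embed_descriptions(df['description'].tolist())
# np.savetxt('data/embs.txt', embs)  # matches the np.loadtxt calls in the app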
pages/__init__.py
ADDED
File without changes
pages/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (203 Bytes).
parsing.ipynb
ADDED
@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "29cc0f22",
+   "metadata": {
+    "toc": true
+   },
+   "source": [
+    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
+    "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fd5af781",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import fake_useragent\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "b6333adb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user = fake_useragent.UserAgent().random"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "db46f2ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "posters = []\n",
+    "titles = []\n",
+    "sources = []\n",
+    "descriptions = []\n",
+    "ganres = []\n",
+    "filters = []\n",
+    "\n",
+    "with open('serials.txt') as file:\n",
+    "    lst = file.read().split('\\n')\n",
+    "s = 10000\n",
+    "for url in lst[10000:]:\n",
+    "    headers = {\n",
+    "        'Accept': '*/*',\n",
+    "        'User-Agent': user\n",
+    "    }\n",
+    "    req = requests.get(url, headers=headers)\n",
+    "    # req = requests.get(url)\n",
+    "    src = req.text\n",
+    "    s += 1\n",
+    "    print(s)\n",
+    "\n",
+    "    with open('index.html', 'w', encoding=\"utf-8\") as file:\n",
+    "        file.write(src)\n",
+    "\n",
+    "    with open('index.html', encoding=\"utf-8\") as file:\n",
+    "        src = file.read()\n",
+    "\n",
+    "    soup = BeautifulSoup(src, 'lxml')\n",
+    "    # pull the title\n",
+    "    try:\n",
+    "        title = soup.find(class_='text text_bold_giant color_white').text\n",
+    "        title = re.sub(r'\\([^)]*\\)', ' ', title).strip()\n",
+    "        titles.append(title)\n",
+    "    except:\n",
+    "        titles.append(None)\n",
+    "    # pull the poster\n",
+    "    try:\n",
+    "        picture_url = soup.find('meta', itemprop='image')\n",
+    "        picture_url = picture_url['content']\n",
+    "        posters.append(picture_url)\n",
+    "    except:\n",
+    "        posters.append(None)\n",
+    "    # pull the genres\n",
+    "    ganre = soup.find_all('span', class_='badge__text')\n",
+    "    helper = []\n",
+    "    for i in ganre:\n",
+    "        helper.append(i.text)\n",
+    "    ganres.append(helper)\n",
+    "    # pull the description\n",
+    "    try:\n",
+    "        description = soup.find('div', class_='p-movie-info__description-text').text\n",
+    "        descriptions.append(description)\n",
+    "    except:\n",
+    "        descriptions.append(None)\n",
+    "    # age rating\n",
+    "    try:\n",
+    "        age_filter = soup.find('span', class_='label_restrict').text\n",
+    "        filters.append(age_filter)\n",
+    "    except:\n",
+    "        filters.append(None)\n",
+    "    # url\n",
+    "    sources.append(url)\n",
+    "    if len(sources) % 1 == 0:\n",
+    "        res = pd.DataFrame({'url': sources,\n",
+    "                            'poster': posters,\n",
+    "                            'title': titles,\n",
+    "                            'ganres': ganres,\n",
+    "                            'description': descriptions,\n",
+    "                            'age_limit': filters})\n",
+    "        print(f'{len(res)} saved')\n",
+    "        res.to_csv('DATA.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1cbe183b",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'DATA1.csv'",
+     "output_type": "error",
+     "traceback": [
+      "FileNotFoundError                         Traceback (most recent call last)",
+      "Cell In[6], line 1",
+      "----> 1 df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')",
+      "FileNotFoundError: [Errno 2] No such file or directory: 'DATA1.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')\n",
+    "df2 = pd.read_csv('DATA2.csv', index_col='Unnamed: 0')\n",
+    "df3 = pd.read_csv('DATA3.csv', index_col='Unnamed: 0')\n",
+    "df4 = pd.read_csv('DATA4.csv', index_col='Unnamed: 0')\n",
+    "df5 = pd.read_csv('DATA5.csv', index_col='Unnamed: 0')\n",
+    "df6 = pd.read_csv('DATA6.csv', index_col='Unnamed: 0')\n",
+    "df7 = pd.read_csv('DATA7.csv', index_col='Unnamed: 0')\n",
+    "df8 = pd.read_csv('DATA8.csv', index_col='Unnamed: 0')\n",
+    "df9 = pd.read_csv('DATA9.csv', index_col='Unnamed: 0')\n",
+    "df10 = pd.read_csv('DATA10.csv', index_col='Unnamed: 0')\n",
+    "df11 = pd.read_csv('DATA11.csv', index_col='Unnamed: 0')\n",
+    "df12 = pd.read_csv('DATA12.csv', index_col='Unnamed: 0')\n",
+    "df13 = pd.read_csv('DATA13.csv', index_col='Unnamed: 0')\n",
+    "df14 = pd.read_csv('DATA14.csv', index_col='Unnamed: 0')\n",
+    "df15 = pd.read_csv('DATA15 - с 11880.csv', index_col='Unnamed: 0')\n",
+    "df16 = pd.read_csv('DATA16.csv', index_col='Unnamed: 0')\n",
+    "df17 = pd.read_csv('DATA17.csv', index_col='Unnamed: 0')\n",
+    "df18 = pd.read_csv('DATA18.csv', index_col='Unnamed: 0')\n",
+    "df19 = pd.read_csv('DATA19.csv', index_col='Unnamed: 0')\n",
+    "df20 = pd.read_csv('DATA20.csv', index_col='Unnamed: 0')\n",
+    "df21 = pd.read_csv('DATA21.csv', index_col='Unnamed: 0')\n",
+    "df22 = pd.read_csv('DATA22.csv', index_col='Unnamed: 0')\n",
+    "df23 = pd.read_csv('DATA23.csv', index_col='Unnamed: 0')\n",
+    "df24 = pd.read_csv('DATA24.csv', index_col='Unnamed: 0')\n",
+    "df25 = pd.read_csv('DATA25.csv', index_col='Unnamed: 0')\n",
+    "df26 = pd.read_csv('DATA26.csv', index_col='Unnamed: 0')\n",
+    "df27 = pd.read_csv('DATA27.csv', index_col='Unnamed: 0')\n",
+    "df28 = pd.read_csv('DATA0-5000.csv', index_col='Unnamed: 0')\n",
+    "df29 = pd.read_csv('DATA2-8.csv', index_col='Unnamed: 0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cda987f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20, \\\n",
+    "                  df21, df22, df23, df24, df25, df26, df27, df28, df29], axis=0).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "222e1aef",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                                   url  \\\n",
+       "14969        https://kino.mail.ru/series_893084_rusalka/   \n",
+       "14970    https://kino.mail.ru/series_838624_sezon_ohoti/   \n",
+       "14971  https://kino.mail.ru/series_783649_smertelnii_...   \n",
+       "14972         https://kino.mail.ru/series_781099_fantom/   \n",
+       "14973       https://kino.mail.ru/series_773279_handerbi/   \n",
+       "\n",
+       "                                                  poster              title  \\\n",
+       "14969  https://resizer.mail.ru/p/11575246-90fe-53c0-b...            Русалка   \n",
+       "14970  https://resizer.mail.ru/p/ca54339b-94e8-5813-a...        Сезон охоты   \n",
+       "14971  https://resizer.mail.ru/p/767707c0-af9c-588a-a...  Смертельный танец   \n",
+       "14972  https://resizer.mail.ru/p/07f60bae-b56a-58ea-b...             Фантом   \n",
+       "14973  https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a...           Хандерби   \n",
+       "\n",
+       "                                                  ganres  \\\n",
+       "14969                                      ['мелодрама']   \n",
+       "14970  ['драма', 'мелодрама', 'комедия', 'для взрослых']   \n",
+       "14971                                       ['детектив']   \n",
+       "14972                                         ['боевик']   \n",
+       "14973                                        ['комедия']   \n",
+       "\n",
+       "                                             description age_limit  \\\n",
+       "14969  Наташа Алпатова (Елена Шилова) — простая девуш...      12 +   \n",
+       "14970  В центре сюжета — история молодого успешного б...      18 +   \n",
+       "14971  В родное Заречье возвращается танцовщица Настя...      16 +   \n",
+       "14972  Сериал расскажет о деятельности спецслужб Росс...      12 +   \n",
+       "14973  Сюжет сериала «Хандерби» начинается в 1831 год...       NaN   \n",
+       "\n",
+       "       Unnamed: 0.1  \n",
+       "14969           NaN  \n",
+       "14970           NaN  \n",
+       "14971           NaN  \n",
+       "14972           NaN  \n",
+       "14973           NaN  "
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85c382bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data['title'] = data['title'].apply(lambda x: re.sub(r'\\([^)]*\\)', ' ', x).strip() if isinstance(x, str) else x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe021810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = data.drop(['Unnamed: 0.1'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f317f7e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 14973 entries, 0 to 14973\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column       Non-Null Count  Dtype \n",
+      "---  ------       --------------  ----- \n",
+      " 0   url          14973 non-null  object\n",
+      " 1   poster       14785 non-null  object\n",
+      " 2   title        14785 non-null  object\n",
+      " 3   ganres       14973 non-null  object\n",
+      " 4   description  14730 non-null  object\n",
+      " 5   age_limit    13105 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 818.8+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57f21838",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = data.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da517ed0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.duplicated().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fcf403cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data.to_csv('data.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".elbrus2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": true,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": true,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
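The 29 numbered read_csv calls in the notebook above can be collapsed into a loop. A sketch, assuming every scraped chunk matches DATA*.csv in the working directory and shares the 'Unnamed: 0' index column written by to_csv:

import glob

import pandas as pd

# Load every scraped chunk in one pass; sorted() gives lexicographic order,
# which is fine here because the index is rebuilt afterwards anyway.
frames = [pd.read_csv(path, index_col='Unnamed: 0')
          for path in sorted(glob.glob('DATA*.csv'))]
data = pd.concat(frames, axis=0).reset_index(drop=True)

Note that even the oddly named 'DATA15 - с 11880.csv' still matches the DATA*.csv pattern.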
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27918d930163e3ac3a4af281d3e0809afab3e1c7a4f4dfae2324614323db51d9
+size 1465
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
+size 125
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae57eda34a3d4e3bbab5edd30c5b7e4ee3c493fa48c2e1af1443b6bd619afc19
+size 1270
tokenizer/vocab.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78106a3d3ae8600d1ba573b967b9bb731d2c2282957cbc6e26ab20935c3da02b
+size 1649718