Анастасия committed on
Commit 582663d · 1 Parent(s): e0a865c

project_streamlit_app

.DS_Store ADDED
Binary file (8.2 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ *.txt filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,24 @@
+ import streamlit as st
+
+ """
+ ## Сервис умного поиска сериалов 📽️
+ #### Проект по рекомендательным системам.
+ """
+
+ st.image('images/ser2.png')
+
+ st.write("""
+ #### Состав команды:
+ \n1. ##### [Анастасия](https://github.com/AnastasiaMozhayskaya) 🧝‍♀️
+ \n2. ##### [Роман](https://github.com/r-makushkin) 🦸‍♂️
+ \n3. ##### [Алексей](https://github.com/WeinsGH) 🦹‍♂️
+ """, unsafe_allow_html=True)
+
+ """
+ #### Задачи:
+ \n- ###### Собрать выборку из не менее чем 5000 сериалов.
+ \n- ###### Разработать систему поиска сериала по пользовательскому запросу.
+ \n- ###### Описания сериалов на русском языке.
+ \n- ###### Сервис должен принимать на вход описание сериала от \
+ пользователя и возвращать заданное количество подходящих вариантов.
+ """
cosine_similarity_model.ipynb ADDED
@@ -0,0 +1,137 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/Anastasia/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import BertTokenizer, BertModel\n",
+ "import torch\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import numpy as np\n",
+ "import time\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(array([ 5517,  9066, 13361, 11717,   320, 10793, 14201,  9305,  9199,\n",
+ "        8294]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))\n",
+ "3.533276081085205\n"
+ ]
+ }
+ ],
+ "source": [
+ "start_time = time.time()\n",
+ "\n",
+ "\n",
+ "# Read the series embedding vectors\n",
+ "embeddings = np.loadtxt('data/embs.txt')\n",
+ "# Paths to the saved model and tokenizer\n",
+ "model_path = \"model\"\n",
+ "tokenizer_path = \"tokenizer\"\n",
+ "\n",
+ "# Load the model\n",
+ "loaded_model = BertModel.from_pretrained(model_path)\n",
+ "\n",
+ "# Load the tokenizer\n",
+ "loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)\n",
+ "\n",
+ "\n",
+ "# Vectorize the query\n",
+ "loaded_model.eval()\n",
+ "tokens = loaded_tokenizer('петух закукарекал', return_tensors=\"pt\", padding=True, truncation=True)\n",
+ "\n",
+ "# Move the tokens to the same device as the model\n",
+ "tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}\n",
+ "\n",
+ "# Pass the tokens through the model to get the embeddings\n",
+ "with torch.no_grad():\n",
+ "    output = loaded_model(**tokens)\n",
+ "\n",
+ "# The embeddings come from the last hidden state\n",
+ "user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()\n",
+ "\n",
+ "\n",
+ "\n",
+ "cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))\n",
+ "\n",
+ "# Get the indices of the 10 best-matching rows in the numpy array\n",
+ "top_10_indices = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-10:], cosine_similarities.shape)\n",
+ "print(top_10_indices)\n",
+ "end_time = time.time()\n",
+ "print(end_time-start_time)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[5517, 9066, 13361, 11717, 320, 10793, 14201, 9305, 9199, 8294]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list(top_10_indices[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".elbrus2",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
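
The ranking cell fully sorts all ~15,000 similarity scores just to take the last ten. A sketch of the same top-10 selection with np.argpartition, which finds the k largest in linear time and only sorts those k (the function name top_k_series and the parameter k are illustrative, not from the commit):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def top_k_series(embeddings, query_vec, k=10):
    # cosine_similarity returns shape (N, 1); flatten before ranking
    sims = cosine_similarity(embeddings, query_vec.reshape(1, -1)).ravel()
    top = np.argpartition(sims, -k)[-k:]        # unordered top k, O(N)
    return top[np.argsort(sims[top])[::-1]]     # best match first

# usage with the artifacts loaded above:
# indices = top_k_series(embeddings, user_embedding, k=10)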
data/.DS_Store ADDED
Binary file (6.15 kB).
 
data/data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
+ size 19266108
data/embs.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:738ec3d829fa4fe69441898cc7f16d3db560af029831428504c7326a5f4de3cf
+ size 292731853
images/.DS_Store ADDED
Binary file (6.15 kB).
 
images/ser2.png ADDED

Git LFS Details

  • SHA256: 035a31442decd33706b2ffff57c59ef4a2363970e4c2c9e91d6a5efef4dd9191
  • Pointer size: 132 Bytes
  • Size of remote file: 1.34 MB
model/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47128322e633a57168f608258f7409ee2890186641e008059bcf1ba3010f3a61
+ size 829
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5726147049874a1efbf83c95cc45f74e24267f501998cdcecea06b474e9d16f
+ size 711436136
pages/.DS_Store ADDED
Binary file (6.15 kB).
 
pages/01_🎥_Serials.py ADDED
@@ -0,0 +1,105 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import ast
+ import random
+ import torch
+ import time
+ from joblib import load
+
+ from transformers import BertTokenizer, BertModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ # import faiss
+ """
+ ## Сервис умного поиска сериалов 📽️
+ """
+
+ # Read the series embedding vectors
+ embeddings = np.loadtxt('data/embs.txt')
+ # Paths to the saved model and tokenizer
+ model_path = "model"
+ tokenizer_path = "tokenizer"
+ # Load the model
+ loaded_model = BertModel.from_pretrained(model_path)
+ # Load the tokenizer
+ loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+
+ df = pd.read_csv('data/data.csv')
+ df['ganres'] = df['ganres'].apply(lambda x: ast.literal_eval(x))
+ df['description'] = df['description'].astype(str)
+
+ st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Наш сервис насчитывает \
+ {len(df)} лучших сериалов</p>', unsafe_allow_html=True)
+
+ st.image('images/ser2.png')
+
+ ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
+                      'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
+                      'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])
+
+ st.sidebar.header('Панель инструментов :gear:')
+ choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
+ n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15, 20, 30])
+ st.sidebar.info("Для наилучшего соответствия запрос должен быть максимально развернутым")
+
+ text = st.text_input('Введите описание для рекомендации')
+
+ # Vectorize the query
+ loaded_model.eval()
+ tokens = loaded_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+ start_time = time.time()
+ tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}
+
+ # Pass the tokens through the model to get the embeddings
+ with torch.no_grad():
+     output = loaded_model(**tokens)
+
+ # The embeddings come from the last hidden state
+ user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()
+ cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))
+
+ button = st.button('Отправить запрос', type="primary")
+
+ if text and button:
+
+     if len(choice_g) == 0:
+         choice_g = ganres_lst
+     # random = random.sample(range(len(df)), 50)
+     top_ind = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-30:][::-1], cosine_similarities.shape)
+     confidence = cosine_similarities[top_ind]
+     top_ind = list(top_ind[0])
+     conf_dict = {}
+     for value, conf in zip(top_ind, confidence):
+         conf_dict[int(value)] = conf
+     # st.write(conf_dict)
+     output_dict = {}
+     for i in top_ind:
+         for ganre in df['ganres'][i]:
+             if ganre in choice_g:
+                 output_dict[i] = df['ganres'][i]
+     # st.write('output_dict')
+     sorted_lst = sorted(output_dict.items(), key=lambda x: len(set(x[1]) & set(choice_g)), reverse=True)
+     n_lst = [i[0] for i in sorted_lst[:n]]
+     st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
+     рекомендаций {len(sorted_lst)}</strong></p>', unsafe_allow_html=True)
+     st.write('\n')
+
+     # Display posters and titles
+     for i in n_lst:
+         col1, col2 = st.columns([3, 4])
+         with col1:
+             st.image(df['poster'][i], width=300)
+         with col2:
+             st.write(f"***Название:*** {df['title'][i]}")
+             st.write(f"***Жанр:*** {', '.join(df['ganres'][i])}")
+             st.write(f"***Описание:*** {df['description'][i]}")
+             # similarity = float(confidence)
+             # st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
+             st.write(f"***Ссылка на сериал: {df['url'][i]}***")
+             st.write(f"")
+             end_time = time.time()
+             st.write(f"<small>*Степень соответствия по косинусному сходству: {conf_dict[i]:.4f}*</small>", unsafe_allow_html=True)
+         st.markdown(
+             "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+             unsafe_allow_html=True
+         )
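
Streamlit re-executes the whole script on every widget interaction, so this page reloads the ~700 MB model, the tokenizer, the ~300 MB embeddings file, and the CSV on each rerun. A sketch of caching the heavy artifacts once per process, assuming a Streamlit version (>= 1.18) where st.cache_resource is available; the loader names are illustrative:

import numpy as np
import pandas as pd
import streamlit as st
from transformers import BertModel, BertTokenizer

@st.cache_resource  # for unserializable objects such as the model
def load_model_and_tokenizer(model_path="model", tokenizer_path="tokenizer"):
    model = BertModel.from_pretrained(model_path)
    model.eval()
    return model, BertTokenizer.from_pretrained(tokenizer_path)

@st.cache_data  # for plain data that can be hashed and copied
def load_embeddings(path="data/embs.txt"):
    return np.loadtxt(path)

@st.cache_data
def load_catalog(path="data/data.csv"):
    return pd.read_csv(path)

With these in place the model load and np.loadtxt cost is paid once per process rather than on every keystroke or button press.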
pages/02_🔥_Results.py ADDED
@@ -0,0 +1,15 @@
+ import streamlit as st
+ from PIL import Image
+
+ st.write("""
+ ## 📝 Итоги проекта Рекомендательные системы.
+ """)
+ """
+ ###### 1. Парсинг профильных сайтов.
+ ###### 2. Сбор и анализ информации с киносервисов. Формирование датасета. Итоговый размер - 14939 объектов.
+ ###### 3. Предобработка данных: удаление лишних символов и пропусков.
+ ###### 4. Векторизация с использованием модели rubert-tiny2.
+ """
+
+
+
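
Item 4 names rubert-tiny2, but the vectorization code itself is not in this commit; only its outputs (model/, tokenizer/, data/embs.txt) are. A sketch of how the description embeddings could be produced, assuming the base checkpoint is cointegrated/rubert-tiny2 and mirroring the mean pooling used on the query side in pages/01_🎥_Serials.py:

import numpy as np
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("cointegrated/rubert-tiny2")  # assumed checkpoint
model = BertModel.from_pretrained("cointegrated/rubert-tiny2")
model.eval()

def embed_batch(texts):
    # same pooling as the app: mean over the last hidden state
    tokens = tokenizer(texts, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).cpu().numpy()

# descs = df['description'].tolist()
# embs = np.vstack([embed_batch(descs[i:i + 64]) for i in range(0, len(descs), 64)])
# np.savetxt('data/embs.txt', embs)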
pages/__init__.py ADDED
File without changes
pages/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (203 Bytes).
 
parsing.ipynb ADDED
@@ -0,0 +1,448 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "29cc0f22",
+ "metadata": {
+ "toc": true
+ },
+ "source": [
+ "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
+ "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fd5af781",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from bs4 import BeautifulSoup\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import fake_useragent\n",
+ "import time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b6333adb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user = fake_useragent.UserAgent().random"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "db46f2ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "posters = []\n",
+ "titles = []\n",
+ "sources = []\n",
+ "descriptions = []\n",
+ "ganres = []\n",
+ "filters = []\n",
+ "\n",
+ "with open('serials.txt') as file:\n",
+ "    lst = file.read().split('\\n')\n",
+ "s = 10000\n",
+ "for url in lst[10000:]:\n",
+ "    headers = {\n",
+ "        'Accept': '*/*',\n",
+ "        'User-Agent': user\n",
+ "    }\n",
+ "    req = requests.get(url, headers=headers)\n",
+ "    # req = requests.get(url)\n",
+ "    src = req.text\n",
+ "    s += 1\n",
+ "    print(s)\n",
+ "\n",
+ "    with open('index.html', 'w', encoding=\"utf-8\") as file:\n",
+ "        file.write(src)\n",
+ "\n",
+ "    with open('index.html', encoding=\"utf-8\") as file:\n",
+ "        src = file.read()\n",
+ "\n",
+ "    soup = BeautifulSoup(src, 'lxml')\n",
+ "    # pull the title\n",
+ "    try:\n",
+ "        title = soup.find(class_='text text_bold_giant color_white').text\n",
+ "        title = re.sub(r'\\([^)]*\\)', ' ', title).strip()\n",
+ "        titles.append(title)\n",
+ "    except:\n",
+ "        titles.append(None)\n",
+ "    # pull the poster\n",
+ "    try:\n",
+ "        picture_url = soup.find('meta', itemprop='image')\n",
+ "        picture_url = picture_url['content']\n",
+ "        posters.append(picture_url)\n",
+ "    except:\n",
+ "        posters.append(None)\n",
+ "    # pull the genres\n",
+ "    ganre = soup.find_all('span', class_='badge__text')\n",
+ "    helper = []\n",
+ "    for i in ganre:\n",
+ "        helper.append(i.text)\n",
+ "    ganres.append(helper)\n",
+ "    # pull the description\n",
+ "    try:\n",
+ "        description = soup.find('div', class_='p-movie-info__description-text').text\n",
+ "        descriptions.append(description)\n",
+ "    except:\n",
+ "        descriptions.append(None)\n",
+ "    # age rating\n",
+ "    try:\n",
+ "        age_filter = soup.find('span', class_='label_restrict').text\n",
+ "        filters.append(age_filter)\n",
+ "    except:\n",
+ "        filters.append(None)\n",
+ "    # url\n",
+ "    sources.append(url)\n",
+ "    if len(sources) % 1 == 0:\n",
+ "        res = pd.DataFrame({'url': sources,\n",
+ "                            'poster': posters,\n",
+ "                            'title': titles,\n",
+ "                            'ganres': ganres,\n",
+ "                            'description': descriptions,\n",
+ "                            'age_limit': filters})\n",
+ "        print(f'{len(res)} saved')\n",
+ "        res.to_csv('DATA.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1cbe183b",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: 'DATA1.csv'",
+ "output_type": "error",
+ "traceback": [
+ "---------------------------------------------------------------------------",
+ "FileNotFoundError                         Traceback (most recent call last)",
+ "Cell In[6], line 1\n----> 1 df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')\n      2 df2 = pd.read_csv('DATA2.csv', index_col='Unnamed: 0')\n      3 df3 = pd.read_csv('DATA3.csv', index_col='Unnamed: 0')\n",
+ "File ~/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\n    935 kwds_defaults = _refine_defaults_read(\n    936     dialect,\n    937     delimiter,\n   (...)\n    944     dtype_backend=dtype_backend,\n    945 )\n    946 kwds.update(kwds_defaults)\n--> 948 return _read(filepath_or_buffer, kwds)\n",
+ "File ~/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)\n    608 _validate_names(kwds.get(\"names\", None))\n    610 # Create the parser.\n--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)\n    613 if chunksize or iterator:\n    614     return parser\n",
+ "File ~/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)\n   1445     self.options[\"has_index_names\"] = kwds[\"has_index_names\"]\n   1447 self.handles: IOHandles | None = None\n-> 1448 self._engine = self._make_engine(f, self.engine)\n",
+ "File ~/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1705, in TextFileReader._make_engine(self, f, engine)\n   1703 if \"b\" not in mode:\n   1704     mode += \"b\"\n-> 1705 self.handles = get_handle(\n   1706     f,\n   1707     mode,\n   1708     encoding=self.options.get(\"encoding\", None),\n   1709     compression=self.options.get(\"compression\", None),\n   1710     memory_map=self.options.get(\"memory_map\", False),\n   1711     is_text=is_text,\n   1712     errors=self.options.get(\"encoding_errors\", \"strict\"),\n   1713     storage_options=self.options.get(\"storage_options\", None),\n   1714 )\n   1715 assert self.handles is not None\n   1716 f = self.handles.handle\n",
+ "File ~/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/pandas/io/common.py:863, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\n    858 elif isinstance(handle, str):\n    859     # Check whether the filename is to be opened in binary mode.\n    860     # Binary mode does not support 'encoding' and 'newline'.\n    861     if ioargs.encoding and \"b\" not in ioargs.mode:\n    862         # Encoding\n--> 863         handle = open(\n    864             handle,\n    865             ioargs.mode,\n    866             encoding=ioargs.encoding,\n    867             errors=errors,\n    868             newline=\"\",\n    869         )\n    870 else:\n    871     # Binary mode\n    872     handle = open(handle, ioargs.mode)\n",
+ "FileNotFoundError: [Errno 2] No such file or directory: 'DATA1.csv'"
+ ]
+ }
+ ],
+ "source": [
+ "df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')\n",
+ "df2 = pd.read_csv('DATA2.csv', index_col='Unnamed: 0')\n",
+ "df3 = pd.read_csv('DATA3.csv', index_col='Unnamed: 0')\n",
+ "df4 = pd.read_csv('DATA4.csv', index_col='Unnamed: 0')\n",
+ "df5 = pd.read_csv('DATA5.csv', index_col='Unnamed: 0')\n",
+ "df6 = pd.read_csv('DATA6.csv', index_col='Unnamed: 0')\n",
+ "df7 = pd.read_csv('DATA7.csv', index_col='Unnamed: 0')\n",
+ "df8 = pd.read_csv('DATA8.csv', index_col='Unnamed: 0')\n",
+ "df9 = pd.read_csv('DATA9.csv', index_col='Unnamed: 0')\n",
+ "df10 = pd.read_csv('DATA10.csv', index_col='Unnamed: 0')\n",
+ "df11 = pd.read_csv('DATA11.csv', index_col='Unnamed: 0')\n",
+ "df12 = pd.read_csv('DATA12.csv', index_col='Unnamed: 0')\n",
+ "df13 = pd.read_csv('DATA13.csv', index_col='Unnamed: 0')\n",
+ "df14 = pd.read_csv('DATA14.csv', index_col='Unnamed: 0')\n",
+ "df15 = pd.read_csv('DATA15 - с 11880.csv', index_col='Unnamed: 0')\n",
+ "df16 = pd.read_csv('DATA16.csv', index_col='Unnamed: 0')\n",
+ "df17 = pd.read_csv('DATA17.csv', index_col='Unnamed: 0')\n",
+ "df18 = pd.read_csv('DATA18.csv', index_col='Unnamed: 0')\n",
+ "df19 = pd.read_csv('DATA19.csv', index_col='Unnamed: 0')\n",
+ "df20 = pd.read_csv('DATA20.csv', index_col='Unnamed: 0')\n",
+ "df21 = pd.read_csv('DATA21.csv', index_col='Unnamed: 0')\n",
+ "df22 = pd.read_csv('DATA22.csv', index_col='Unnamed: 0')\n",
+ "df23 = pd.read_csv('DATA23.csv', index_col='Unnamed: 0')\n",
+ "df24 = pd.read_csv('DATA24.csv', index_col='Unnamed: 0')\n",
+ "df25 = pd.read_csv('DATA25.csv', index_col='Unnamed: 0')\n",
+ "df26 = pd.read_csv('DATA26.csv', index_col='Unnamed: 0')\n",
+ "df27 = pd.read_csv('DATA27.csv', index_col='Unnamed: 0')\n",
+ "df28 = pd.read_csv('DATA0-5000.csv', index_col='Unnamed: 0')\n",
+ "df29 = pd.read_csv('DATA2-8.csv', index_col='Unnamed: 0')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8cda987f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20, \\\n",
+ "                  df21, df22, df23, df24, df25, df26, df27, df28, df29], axis=0).reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "222e1aef",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>url</th>\n",
+ "      <th>poster</th>\n",
+ "      <th>title</th>\n",
+ "      <th>ganres</th>\n",
+ "      <th>description</th>\n",
+ "      <th>age_limit</th>\n",
+ "      <th>Unnamed: 0.1</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>14969</th>\n",
+ "      <td>https://kino.mail.ru/series_893084_rusalka/</td>\n",
+ "      <td>https://resizer.mail.ru/p/11575246-90fe-53c0-b...</td>\n",
+ "      <td>Русалка</td>\n",
+ "      <td>['мелодрама']</td>\n",
+ "      <td>Наташа Алпатова (Елена Шилова) — простая девуш...</td>\n",
+ "      <td>12 +</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>14970</th>\n",
+ "      <td>https://kino.mail.ru/series_838624_sezon_ohoti/</td>\n",
+ "      <td>https://resizer.mail.ru/p/ca54339b-94e8-5813-a...</td>\n",
+ "      <td>Сезон охоты</td>\n",
+ "      <td>['драма', 'мелодрама', 'комедия', 'для взрослых']</td>\n",
+ "      <td>В центре сюжета — история молодого успешного б...</td>\n",
+ "      <td>18 +</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>14971</th>\n",
+ "      <td>https://kino.mail.ru/series_783649_smertelnii_...</td>\n",
+ "      <td>https://resizer.mail.ru/p/767707c0-af9c-588a-a...</td>\n",
+ "      <td>Смертельный танец</td>\n",
+ "      <td>['детектив']</td>\n",
+ "      <td>В родное Заречье возвращается танцовщица Настя...</td>\n",
+ "      <td>16 +</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>14972</th>\n",
+ "      <td>https://kino.mail.ru/series_781099_fantom/</td>\n",
+ "      <td>https://resizer.mail.ru/p/07f60bae-b56a-58ea-b...</td>\n",
+ "      <td>Фантом</td>\n",
+ "      <td>['боевик']</td>\n",
+ "      <td>Сериал расскажет о деятельности спецслужб Росс...</td>\n",
+ "      <td>12 +</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>14973</th>\n",
+ "      <td>https://kino.mail.ru/series_773279_handerbi/</td>\n",
+ "      <td>https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a...</td>\n",
+ "      <td>Хандерби</td>\n",
+ "      <td>['комедия']</td>\n",
+ "      <td>Сюжет сериала «Хандерби» начинается в 1831 год...</td>\n",
+ "      <td>NaN</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ "                                                   url  \\\n",
+ "14969      https://kino.mail.ru/series_893084_rusalka/   \n",
+ "14970  https://kino.mail.ru/series_838624_sezon_ohoti/   \n",
+ "14971  https://kino.mail.ru/series_783649_smertelnii_...   \n",
+ "14972       https://kino.mail.ru/series_781099_fantom/   \n",
+ "14973     https://kino.mail.ru/series_773279_handerbi/   \n",
+ "\n",
+ "                                                  poster              title  \\\n",
+ "14969  https://resizer.mail.ru/p/11575246-90fe-53c0-b...            Русалка   \n",
+ "14970  https://resizer.mail.ru/p/ca54339b-94e8-5813-a...        Сезон охоты   \n",
+ "14971  https://resizer.mail.ru/p/767707c0-af9c-588a-a...  Смертельный танец   \n",
+ "14972  https://resizer.mail.ru/p/07f60bae-b56a-58ea-b...             Фантом   \n",
+ "14973  https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a...           Хандерби   \n",
+ "\n",
+ "                                                  ganres  \\\n",
+ "14969                                      ['мелодрама']   \n",
+ "14970  ['драма', 'мелодрама', 'комедия', 'для взрослых']   \n",
+ "14971                                       ['детектив']   \n",
+ "14972                                         ['боевик']   \n",
+ "14973                                        ['комедия']   \n",
+ "\n",
+ "                                             description age_limit  \\\n",
+ "14969  Наташа Алпатова (Елена Шилова) — простая девуш...      12 +   \n",
+ "14970  В центре сюжета — история молодого успешного б...      18 +   \n",
+ "14971  В родное Заречье возвращается танцовщица Настя...      16 +   \n",
+ "14972  Сериал расскажет о деятельности спецслужб Росс...      12 +   \n",
+ "14973  Сюжет сериала «Хандерби» начинается в 1831 год...       NaN   \n",
+ "\n",
+ "       Unnamed: 0.1  \n",
+ "14969           NaN  \n",
+ "14970           NaN  \n",
+ "14971           NaN  \n",
+ "14972           NaN  \n",
+ "14973           NaN  "
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85c382bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data['title'] = data['title'].apply(lambda x: re.sub(r'\\([^)]*\\)', ' ', x).strip() if isinstance(x, str) else x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe021810",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = data.drop(['Unnamed: 0.1'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f317f7e3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "Index: 14973 entries, 0 to 14973\n",
+ "Data columns (total 6 columns):\n",
+ " #   Column       Non-Null Count  Dtype \n",
+ "---  ------       --------------  ----- \n",
+ " 0   url          14973 non-null  object\n",
+ " 1   poster       14785 non-null  object\n",
+ " 2   title        14785 non-null  object\n",
+ " 3   ganres       14973 non-null  object\n",
+ " 4   description  14730 non-null  object\n",
+ " 5   age_limit    13105 non-null  object\n",
+ "dtypes: object(6)\n",
+ "memory usage: 818.8+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57f21838",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = data.drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da517ed0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.duplicated().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fcf403cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# data.to_csv('data.csv')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".elbrus2",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": true,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": true,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
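
The merge cell above reads 29 numbered CSV dumps into 29 separate variables before concatenating. A sketch of the same merge with a glob over the partial dumps (assumes all DATA*.csv parts sit in the working directory, as in the notebook):

from pathlib import Path
import pandas as pd

# Order is lexicographic (DATA10 before DATA2), which is fine here
# because the index is rebuilt after the concat anyway.
parts = sorted(Path('.').glob('DATA*.csv'))
data = pd.concat(
    (pd.read_csv(p, index_col='Unnamed: 0') for p in parts),
    axis=0,
).reset_index(drop=True)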
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27918d930163e3ac3a4af281d3e0809afab3e1c7a4f4dfae2324614323db51d9
+ size 1465
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
+ size 125
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae57eda34a3d4e3bbab5edd30c5b7e4ee3c493fa48c2e1af1443b6bd619afc19
+ size 1270
tokenizer/vocab.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78106a3d3ae8600d1ba573b967b9bb731d2c2282957cbc6e26ab20935c3da02b
+ size 1649718