Анастасия committed
Commit · 582663d
1 Parent(s): e0a865c
project_streamlit_app
Browse files
- .DS_Store +0 -0
- .gitattributes +4 -0
- __init__.py +0 -0
- app.py +24 -0
- cosine_similarity_model.ipynb +137 -0
- data/.DS_Store +0 -0
- data/data.csv +3 -0
- data/embs.txt +3 -0
- images/.DS_Store +0 -0
- images/ser2.png +3 -0
- model/config.json +3 -0
- model/model.safetensors +3 -0
- pages/.DS_Store +0 -0
- pages/01_🎥_Serials.py +105 -0
- pages/02_🔥_Results.py +15 -0
- pages/__init__.py +0 -0
- pages/__pycache__/__init__.cpython-310.pyc +0 -0
- parsing.ipynb +448 -0
- requirements.txt +3 -0
- tokenizer/special_tokens_map.json +3 -0
- tokenizer/tokenizer_config.json +3 -0
- tokenizer/vocab.txt +3 -0
.DS_Store
ADDED
Binary file (8.2 kB).
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,24 @@
+import streamlit as st
+
+"""
+## Сервис умного поиска сериалов 📽️
+#### Проект по рекомендательным системам.
+"""
+
+st.image('images/ser2.png')
+
+st.write("""
+#### Состав команды:
+\n1. ##### [Анастасия](https://github.com/AnastasiaMozhayskaya) 🧝‍♀️
+\n2. ##### [Роман](https://github.com/r-makushkin) 🦸‍♂️
+\n3. ##### [Алексей](https://github.com/WeinsGH) 🦹‍♂️
+""", unsafe_allow_html=True)
+
+"""
+#### Задачи:
+\n- ###### Собрать выборку из не менее, чем 5000 сериалов
+\n- ###### Разработать систему поиска сериала по пользовательскому запросу.
+\n- ###### Описания сериалов на русском языке
+\n- ###### Сервис должен принимать на вход описание сериала от \
+пользователя и возвращать заданное количество подходящих вариантов.
+"""
cosine_similarity_model.ipynb
ADDED
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/Anastasia/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import BertTokenizer, BertModel\n",
+    "import torch\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "import numpy as np\n",
+    "import time\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(array([ 5517,  9066, 13361, 11717,   320, 10793, 14201,  9305,  9199,\n",
+      "        8294]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))\n",
+      "3.533276081085205\n"
+     ]
+    }
+   ],
+   "source": [
+    "start_time = time.time()\n",
+    "\n",
+    "\n",
+    "# Read the series embeddings\n",
+    "embeddings = np.loadtxt('data/embs.txt')\n",
+    "# Paths to the saved model and tokenizer\n",
+    "model_path = \"model\"\n",
+    "tokenizer_path = \"tokenizer\"\n",
+    "\n",
+    "# Load the model\n",
+    "loaded_model = BertModel.from_pretrained(model_path)\n",
+    "\n",
+    "# Load the tokenizer\n",
+    "loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)\n",
+    "\n",
+    "\n",
+    "# Vectorize the query\n",
+    "loaded_model.eval()\n",
+    "tokens = loaded_tokenizer('петух закукарекал', return_tensors=\"pt\", padding=True, truncation=True)\n",
+    "\n",
+    "# Move the tokens to the same device as the model\n",
+    "tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}\n",
+    "\n",
+    "# Pass the tokens through the model to get embeddings\n",
+    "with torch.no_grad():\n",
+    "    output = loaded_model(**tokens)\n",
+    "\n",
+    "# The embedding is the mean of the last hidden state\n",
+    "user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()\n",
+    "\n",
+    "\n",
+    "cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))\n",
+    "\n",
+    "# Get the indices of the 10 best-matching rows in the numpy array\n",
+    "top_10_indices = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-10:], cosine_similarities.shape)\n",
+    "print(top_10_indices)\n",
+    "end_time = time.time()\n",
+    "print(end_time-start_time)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[5517, 9066, 13361, 11717, 320, 10793, 14201, 9305, 9199, 8294]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(top_10_indices[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".elbrus2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
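A note on the retrieval cell above: np.argsort sorts ascending, so the [-10:] tail lists the matches worst-to-best, and np.unravel_index is unnecessary for an (N, 1) similarity matrix. Below is a minimal sketch of the same lookup factored into a reusable, best-first helper; the max_length=512 cap is an assumption added to address the truncation warning in the cell's stderr.

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity


def top_k_series(query, tokenizer, model, embeddings, k=10):
    """Return indices of the k series closest to the query, best match first."""
    tokens = tokenizer(query, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    tokens = {key: value.to(model.device) for key, value in tokens.items()}
    with torch.no_grad():
        output = model(**tokens)
    # Mean-pool the last hidden state into a single query vector
    query_emb = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    sims = cosine_similarity(embeddings, query_emb.reshape(1, -1)).ravel()
    # argsort is ascending: take the k largest, then reverse for best-first
    return np.argsort(sims)[-k:][::-1]

With the objects loaded in the cell above, top_k_series('петух закукарекал', loaded_tokenizer, loaded_model, embeddings) would reproduce the printed indices, reversed into best-first order (the max_length cap does not bite on such a short query).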
data/.DS_Store
ADDED
Binary file (6.15 kB).
data/data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
+size 19266108
data/embs.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:738ec3d829fa4fe69441898cc7f16d3db560af029831428504c7326a5f4de3cf
+size 292731853
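The pointer above shows data/embs.txt is roughly 292 MB of text, and both the notebook and the Serials page re-parse it with np.loadtxt at startup. A possible speed-up, sketched under the assumption that the text file stays the source of truth (data/embs.npy is a hypothetical new path, not part of the commit):

import numpy as np

# One-off conversion: parsing ~292 MB of text with np.loadtxt is slow,
# while np.load on a binary .npy file is close to a straight memory read.
embeddings = np.loadtxt('data/embs.txt')
np.save('data/embs.npy', embeddings)

# At app startup, load the binary copy instead:
embeddings = np.load('data/embs.npy')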
images/.DS_Store
ADDED
Binary file (6.15 kB).
images/ser2.png
ADDED
Binary file (stored in Git LFS).
model/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47128322e633a57168f608258f7409ee2890186641e008059bcf1ba3010f3a61
+size 829
model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5726147049874a1efbf83c95cc45f74e24267f501998cdcecea06b474e9d16f
+size 711436136
pages/.DS_Store
ADDED
Binary file (6.15 kB).
pages/01_🎥_Serials.py
ADDED
@@ -0,0 +1,105 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import ast
+import random
+import torch
+import time
+from joblib import load
+
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics.pairwise import cosine_similarity
+# import faiss
+"""
+## Сервис умного поиска сериалов 📽️
+"""
+
+# Read the series embeddings
+embeddings = np.loadtxt('data/embs.txt')
+# Paths to the saved model and tokenizer
+model_path = "model"
+tokenizer_path = "tokenizer"
+# Load the model
+loaded_model = BertModel.from_pretrained(model_path)
+# Load the tokenizer
+loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+
+df = pd.read_csv('data/data.csv')
+df['ganres'] = df['ganres'].apply(lambda x: ast.literal_eval(x))
+df['description'] = df['description'].astype(str)
+
+st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Наш сервис насчитывает \
+{len(df)} лучших сериалов</p>', unsafe_allow_html=True)
+
+st.image('images/ser2.png')
+
+ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
+                     'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
+                     'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])
+
+st.sidebar.header('Панель инструментов :gear:')
+choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
+n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15, 20, 30])
+st.sidebar.info("Для наилучшего соответствия, запрос должен быть максимально развернутым")
+
+text = st.text_input('Введите описание для рекомендации')
+
+# Vectorize the query
+loaded_model.eval()
+tokens = loaded_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+start_time = time.time()
+tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}
+
+# Pass the tokens through the model to get embeddings
+with torch.no_grad():
+    output = loaded_model(**tokens)
+
+# The embedding is the mean of the last hidden state
+user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()
+cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))
+
+button = st.button('Отправить запрос', type="primary")
+
+if text and button:
+
+    if len(choice_g) == 0:
+        choice_g = ganres_lst
+    # random = random.sample(range(len(df)), 50)
+    top_ind = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-30:][::-1], cosine_similarities.shape)
+    confidence = cosine_similarities[top_ind]
+    top_ind = list(top_ind[0])
+    conf_dict = {}
+    for value, conf in zip(top_ind, confidence):
+        conf_dict[int(value)] = conf
+    # st.write(conf_dict)
+    output_dict = {}
+    for i in top_ind:
+        for ganre in df['ganres'][i]:
+            if ganre in choice_g:
+                output_dict[i] = df['ganres'][i]
+    # st.write('output_dict')
+    sorted_lst = sorted(output_dict.items(), key=lambda x: len(set(x[1]) & set(choice_g)), reverse=True)
+    n_lst = [i[0] for i in sorted_lst[:n]]
+    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
+рекомендаций {len(sorted_lst)}</strong></p>', unsafe_allow_html=True)
+    st.write('\n')
+
+    # Display the posters and titles
+    for i in n_lst:
+        col1, col2 = st.columns([3, 4])
+        with col1:
+            st.image(df['poster'][i], width=300)
+        with col2:
+            st.write(f"***Название:*** {df['title'][i]}")
+            st.write(f"***Жанр:*** {', '.join(df['ganres'][i])}")
+            st.write(f"***Описание:*** {df['description'][i]}")
+            # similarity = float(confidence)
+            # st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
+            st.write(f"***Ссылка на сериал: {df['url'][i]}***")
+            st.write(f"")
+        end_time = time.time()
+        st.write(f"<small>*Степень соответствия по косинусному сходству: {conf_dict[i]:.4f}*</small>", unsafe_allow_html=True)
+        st.markdown(
+            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+            unsafe_allow_html=True
+        )
pages/02_🔥_Results.py
ADDED
@@ -0,0 +1,15 @@
+import streamlit as st
+from PIL import Image
+
+st.write("""
+## 📝 Итоги проекта Рекомендательные системы.
+""")
+"""
+###### 1. Парсинг профильных сайтов.
+###### 2. Сбор и анализ информации с киносервисов. Формирование датасета. Итоговый размер - 14939 объектов.
+###### 3. Предобработка данных от лишних символов и пропусков.
+###### 4. Векторизация с использованием модели rubert-tiny2.
+"""
+
+
+
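Point 4 of the summary is the step that produced data/embs.txt. The commit ships only the resulting model, tokenizer, and embedding files, so the following is an illustrative sketch of how that vectorization might have looked; the Hub ID cointegrated/rubert-tiny2, the batch size, the max_length cap, and mean pooling over the last hidden state are assumptions (mean pooling matches what the app does at query time):

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
model.eval()


def embed_descriptions(texts, batch_size=64):
    """Mean-pooled rubert-tiny2 embeddings for a list of series descriptions."""
    chunks = []
    for i in range(0, len(texts), batch_size):
        tokens = tokenizer(texts[i:i + batch_size], return_tensors="pt",
                           padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            out = model(**tokens)
        chunks.append(out.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.concatenate(chunks)


# embs = embed_descriptions(df['description'].tolist())
# np.savetxt('data/embs.txt', embs)  # matches the np.loadtxt calls in the app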
pages/__init__.py
ADDED
File without changes
pages/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (203 Bytes).
parsing.ipynb
ADDED
@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "29cc0f22",
+   "metadata": {
+    "toc": true
+   },
+   "source": [
+    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
+    "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fd5af781",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import fake_useragent\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "b6333adb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user = fake_useragent.UserAgent().random"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "db46f2ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "posters = []\n",
+    "titles = []\n",
+    "sources = []\n",
+    "descriptions = []\n",
+    "ganres = []\n",
+    "filters = []\n",
+    "\n",
+    "with open('serials.txt') as file:\n",
+    "    lst = file.read().split('\\n')\n",
+    "s = 10000\n",
+    "for url in lst[10000:]:\n",
+    "    headers = {\n",
+    "        'Accept': '*/*',\n",
+    "        'User-Agent': user\n",
+    "    }\n",
+    "    req = requests.get(url, headers=headers)\n",
+    "    # req = requests.get(url)\n",
+    "    src = req.text\n",
+    "    s += 1\n",
+    "    print(s)\n",
+    "\n",
+    "    with open('index.html', 'w', encoding=\"utf-8\") as file:\n",
+    "        file.write(src)\n",
+    "\n",
+    "    with open('index.html', encoding=\"utf-8\") as file:\n",
+    "        src = file.read()\n",
+    "\n",
+    "    soup = BeautifulSoup(src, 'lxml')\n",
+    "    # pull the title\n",
+    "    try:\n",
+    "        title = soup.find(class_='text text_bold_giant color_white').text\n",
+    "        title = re.sub(r'\\([^)]*\\)', ' ', title).strip()\n",
+    "        titles.append(title)\n",
+    "    except:\n",
+    "        titles.append(None)\n",
+    "    # pull the poster\n",
+    "    try:\n",
+    "        picture_url = soup.find('meta', itemprop='image')\n",
+    "        picture_url = picture_url['content']\n",
+    "        posters.append(picture_url)\n",
+    "    except:\n",
+    "        posters.append(None)\n",
+    "    # pull the genres\n",
+    "    ganre = soup.find_all('span', class_='badge__text')\n",
+    "    helper = []\n",
+    "    for i in ganre:\n",
+    "        helper.append(i.text)\n",
+    "    ganres.append(helper)\n",
+    "    # pull the description\n",
+    "    try:\n",
+    "        description = soup.find('div', class_='p-movie-info__description-text').text\n",
+    "        descriptions.append(description)\n",
+    "    except:\n",
+    "        descriptions.append(None)\n",
+    "    # age rating\n",
+    "    try:\n",
+    "        age_filter = soup.find('span', class_='label_restrict').text\n",
+    "        filters.append(age_filter)\n",
+    "    except:\n",
+    "        filters.append(None)\n",
+    "    # url\n",
+    "    sources.append(url)\n",
+    "    if len(sources) % 1 == 0:\n",
+    "        res = pd.DataFrame({'url': sources,\n",
+    "                            'poster': posters,\n",
+    "                            'title': titles,\n",
+    "                            'ganres': ganres,\n",
+    "                            'description': descriptions,\n",
+    "                            'age_limit': filters})\n",
+    "        print(f'{len(res)} saved')\n",
+    "        res.to_csv('DATA.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1cbe183b",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'DATA1.csv'",
+     "output_type": "error",
+     "traceback": [
+      "FileNotFoundError                         Traceback (most recent call last)",
+      "Cell In[6], line 1",
+      "----> 1 df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')",
+      "FileNotFoundError: [Errno 2] No such file or directory: 'DATA1.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "df1 = pd.read_csv('DATA1.csv', index_col='Unnamed: 0')\n",
+    "df2 = pd.read_csv('DATA2.csv', index_col='Unnamed: 0')\n",
+    "df3 = pd.read_csv('DATA3.csv', index_col='Unnamed: 0')\n",
+    "df4 = pd.read_csv('DATA4.csv', index_col='Unnamed: 0')\n",
+    "df5 = pd.read_csv('DATA5.csv', index_col='Unnamed: 0')\n",
+    "df6 = pd.read_csv('DATA6.csv', index_col='Unnamed: 0')\n",
+    "df7 = pd.read_csv('DATA7.csv', index_col='Unnamed: 0')\n",
+    "df8 = pd.read_csv('DATA8.csv', index_col='Unnamed: 0')\n",
+    "df9 = pd.read_csv('DATA9.csv', index_col='Unnamed: 0')\n",
+    "df10 = pd.read_csv('DATA10.csv', index_col='Unnamed: 0')\n",
+    "df11 = pd.read_csv('DATA11.csv', index_col='Unnamed: 0')\n",
+    "df12 = pd.read_csv('DATA12.csv', index_col='Unnamed: 0')\n",
+    "df13 = pd.read_csv('DATA13.csv', index_col='Unnamed: 0')\n",
+    "df14 = pd.read_csv('DATA14.csv', index_col='Unnamed: 0')\n",
+    "df15 = pd.read_csv('DATA15 - с 11880.csv', index_col='Unnamed: 0')\n",
+    "df16 = pd.read_csv('DATA16.csv', index_col='Unnamed: 0')\n",
+    "df17 = pd.read_csv('DATA17.csv', index_col='Unnamed: 0')\n",
+    "df18 = pd.read_csv('DATA18.csv', index_col='Unnamed: 0')\n",
+    "df19 = pd.read_csv('DATA19.csv', index_col='Unnamed: 0')\n",
+    "df20 = pd.read_csv('DATA20.csv', index_col='Unnamed: 0')\n",
+    "df21 = pd.read_csv('DATA21.csv', index_col='Unnamed: 0')\n",
+    "df22 = pd.read_csv('DATA22.csv', index_col='Unnamed: 0')\n",
+    "df23 = pd.read_csv('DATA23.csv', index_col='Unnamed: 0')\n",
+    "df24 = pd.read_csv('DATA24.csv', index_col='Unnamed: 0')\n",
+    "df25 = pd.read_csv('DATA25.csv', index_col='Unnamed: 0')\n",
+    "df26 = pd.read_csv('DATA26.csv', index_col='Unnamed: 0')\n",
+    "df27 = pd.read_csv('DATA27.csv', index_col='Unnamed: 0')\n",
+    "df28 = pd.read_csv('DATA0-5000.csv', index_col='Unnamed: 0')\n",
+    "df29 = pd.read_csv('DATA2-8.csv', index_col='Unnamed: 0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cda987f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20, \\\n",
+    "                  df21, df22, df23, df24, df25, df26, df27, df28, df29], axis=0).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "222e1aef",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                                   url  \\\n",
+       "14969        https://kino.mail.ru/series_893084_rusalka/   \n",
+       "14970    https://kino.mail.ru/series_838624_sezon_ohoti/   \n",
+       "14971  https://kino.mail.ru/series_783649_smertelnii_...   \n",
+       "14972         https://kino.mail.ru/series_781099_fantom/   \n",
+       "14973       https://kino.mail.ru/series_773279_handerbi/   \n",
+       "\n",
+       "                                                  poster              title  \\\n",
+       "14969  https://resizer.mail.ru/p/11575246-90fe-53c0-b...            Русалка   \n",
+       "14970  https://resizer.mail.ru/p/ca54339b-94e8-5813-a...        Сезон охоты   \n",
+       "14971  https://resizer.mail.ru/p/767707c0-af9c-588a-a...  Смертельный танец   \n",
+       "14972  https://resizer.mail.ru/p/07f60bae-b56a-58ea-b...             Фантом   \n",
+       "14973  https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a...           Хандерби   \n",
+       "\n",
+       "                                                  ganres  \\\n",
+       "14969                                      ['мелодрама']   \n",
+       "14970  ['драма', 'мелодрама', 'комедия', 'для взрослых']   \n",
+       "14971                                       ['детектив']   \n",
+       "14972                                         ['боевик']   \n",
+       "14973                                        ['комедия']   \n",
+       "\n",
+       "                                             description age_limit  \\\n",
+       "14969  Наташа Алпатова (Елена Шилова) — простая девуш...      12 +   \n",
+       "14970  В центре сюжета — история молодого успешного б...      18 +   \n",
+       "14971  В родное Заречье возвращается танцовщица Настя...      16 +   \n",
+       "14972  Сериал расскажет о деятельности спецслужб Росс...      12 +   \n",
+       "14973  Сюжет сериала «Хандерби» начинается в 1831 год...       NaN   \n",
+       "\n",
+       "       Unnamed: 0.1  \n",
+       "14969           NaN  \n",
+       "14970           NaN  \n",
+       "14971           NaN  \n",
+       "14972           NaN  \n",
+       "14973           NaN  "
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85c382bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data['title'] = data['title'].apply(lambda x: re.sub(r'\\([^)]*\\)', ' ', x).strip() if isinstance(x, str) else x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe021810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = data.drop(['Unnamed: 0.1'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f317f7e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 14973 entries, 0 to 14973\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column       Non-Null Count  Dtype \n",
+      "---  ------       --------------  ----- \n",
+      " 0   url          14973 non-null  object\n",
+      " 1   poster       14785 non-null  object\n",
+      " 2   title        14785 non-null  object\n",
+      " 3   ganres       14973 non-null  object\n",
+      " 4   description  14730 non-null  object\n",
+      " 5   age_limit    13105 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 818.8+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57f21838",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = data.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da517ed0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.duplicated().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fcf403cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data.to_csv('data.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".elbrus2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": true,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": true,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
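The 29 numbered read_csv calls in the notebook above can be collapsed into a loop. A sketch, assuming every scraped chunk matches DATA*.csv in the working directory and shares the 'Unnamed: 0' index column written by to_csv:

import glob

import pandas as pd

# Load every scraped chunk in one pass; sorted() gives lexicographic order,
# which is fine here because the index is rebuilt afterwards anyway.
frames = [pd.read_csv(path, index_col='Unnamed: 0')
          for path in sorted(glob.glob('DATA*.csv'))]
data = pd.concat(frames, axis=0).reset_index(drop=True)

Note that even the oddly named 'DATA15 - с 11880.csv' still matches the DATA*.csv pattern.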
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27918d930163e3ac3a4af281d3e0809afab3e1c7a4f4dfae2324614323db51d9
+size 1465
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
+size 125
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae57eda34a3d4e3bbab5edd30c5b7e4ee3c493fa48c2e1af1443b6bd619afc19
+size 1270
tokenizer/vocab.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78106a3d3ae8600d1ba573b967b9bb731d2c2282957cbc6e26ab20935c3da02b
+size 1649718