Spaces:
Sleeping
Sleeping
initial
Browse files- app.py +60 -0
- data/book_embeddings.npy +3 -0
- data/book_embeddings_ms.npy +3 -0
- data/books_data_cleaned.csv +0 -0
- notebooks/parser.py +59 -0
- pages/recommendations.py +96 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import pandas as pd


st.title('Рекомендация книг на основе пользовательского промта')
st.subheader('Основная информация')


@st.cache_data
def load_data(file_path):
    """Load the cleaned book catalogue from a CSV file.

    Cached with ``st.cache_data`` so the CSV is parsed once per session
    instead of on every Streamlit rerun — same loading style as
    pages/recommendations.py, which defines the identical loader cached.
    """
    return pd.read_csv(file_path)


# Catalogue used by show_random_books() below.
df = load_data('data/books_data_cleaned.csv')
# One-row summary of the scraped dataset, shown as a small table.
data = {
    'Источник': ['chitai-gorod.ru'],
    'Кол-во книг': ['5000'],
    'Уникальных авторов': ['2112'],
    'Жанр': ['Художественная лит-ра'],
    'Время парсинга': ['77 минут'],
}
df1 = pd.DataFrame(data)
st.dataframe(df1)
st.subheader('Используемые модели:')

# Model table with raw HTML anchors; rendered via to_html(escape=False)
# + unsafe_allow_html so the links stay clickable (st.dataframe would
# show the markup as plain text).
data1 = {
    'sentence-transformers': ['<a href="https://huggingface.co/sentence-transformers/all-mpnet-base-v2" target="_blank">all-mpnet-base-v2</a>'],
    'Pre-Trained MS MARCO Models': ['<a href="https://huggingface.co/sentence-transformers/msmarco-roberta-base-v3" target="_blank">msmarco-roberta-base-v3</a>'],
}
df2 = pd.DataFrame(data1)
st.markdown(df2.to_html(escape=False), unsafe_allow_html=True)
def show_random_books():
    """Render 10 randomly sampled catalogue books in a two-per-row grid.

    Each visual row holds four Streamlit columns: cover and info for the
    left book, cover and info for the right book. Relies on the
    module-level ``df`` having 'image_url', 'title', 'author' and
    'page_url' columns.
    """
    sample_df = df.sample(n=10)
    num_books = len(sample_df)
    num_rows = (num_books + 1) // 2  # two books per visual row

    for i in range(num_rows):
        cols = st.columns(4)  # [cover-left, info-left, cover-right, info-right]
        for j in range(2):
            index = i * 2 + j
            if index >= num_books:
                continue
            row = sample_df.iloc[index]
            # cols[2*j] shows the cover, cols[2*j + 1] the details — this
            # replaces the two copy-pasted j == 0 / j == 1 branches of the
            # original.
            with cols[2 * j]:
                st.image(row['image_url'], width=200)
            with cols[2 * j + 1]:
                st.subheader(row['title'])
                st.write(f"Автор: {row['author']}")
                st.write(f"**Ссылка:** [книга]({row['page_url']})")
                # Separator after every book; the original inconsistently
                # drew it only for the left-hand book.
                st.write("---")
# Entry point for the random-sample demo: every click reruns the script
# and draws a fresh sample of 10 books.
if st.button('Показать 10 случайных книг'):
    show_random_books()
data/book_embeddings.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbb86f342a62c07926d39da48daab2d4481d7fcf4437eae254eaa42ef8ac0e91
|
3 |
+
size 15283328
|
data/book_embeddings_ms.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d90b7ba8398f552f593df4688485b51d0aea57e9712a480878f79799d17a59c9
|
3 |
+
size 15283328
|
data/books_data_cleaned.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/parser.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Accumulator for the scraped catalogue. Columns are filled in two passes:
# listing pages provide page_url/title/author, then per-book detail pages
# add image_url/annotation.
df = pd.DataFrame(columns=['page_url', 'image_url', 'author', 'title', 'annotation'])
def extract_data_from_page(page_number):
    """Scrape one listing page of the chitai-gorod.ru fiction catalogue.

    Parameters
    ----------
    page_number : int
        1-based page index appended as the ``page`` query parameter.

    Returns
    -------
    list[dict]
        One dict per book with 'page_url', 'title' and 'author' keys.
        Cards missing an expected element are logged and skipped.
    """
    url = f'https://www.chitai-gorod.ru/catalog/books/hudozhestvennaya-literatura-110001?page={page_number}'
    # Timeout added so one stalled connection cannot hang the whole crawl
    # (requests.get blocks forever without it).
    # NOTE(review): consider response.raise_for_status() as well — an HTTP
    # error page currently just parses to zero cards.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find_all('article', class_='product-card')

    data = []
    for book in books:
        try:
            book_url = book.find('a', class_='product-card__picture')['href']
            title = book.find('div', class_='product-title__head').get_text(strip=True)
            author = book.find('div', class_='product-title__author').get_text(strip=True)
            absolute_url = f'https://www.chitai-gorod.ru{book_url}'

            data.append({'page_url': absolute_url, 'title': title, 'author': author})
        except Exception as e:
            # Best-effort: a single malformed card must not abort the page.
            print(f"Error processing book: {e}")

    return data
# Crawl listing pages until at least 5000 books are collected.
# NOTE(review): the range starts at page 2, so page 1 of the catalogue is
# never scraped — confirm this is intentional.
for page in range(2, 201):
    print(f"Processing page {page}...")
    page_data = extract_data_from_page(page)
    df = pd.concat([df, pd.DataFrame(page_data)], ignore_index=True)
    if len(df) >= 5000:
        break

# Trim any overshoot from the final page to exactly 5000 rows.
df = df.head(5000)
def extract_book_details(book_url):
    """Fetch the cover image URL and annotation text from a book page.

    Parameters
    ----------
    book_url : str
        Absolute URL of the book's detail page.

    Returns
    -------
    tuple
        ``(image_url, annotation)``; either element is None when the
        corresponding tag is absent, and ``(None, None)`` on any error.
    """
    try:
        # Timeout keeps one stalled request from hanging the 5000-page pass.
        response = requests.get(book_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # NOTE(review): Open Graph meta tags conventionally use the
        # `property` attribute rather than `name`; keep the original
        # lookup first and fall back to the standard spelling.
        image_tag = (soup.find('meta', {'name': 'og:image'})
                     or soup.find('meta', {'property': 'og:image'}))
        image_url = image_tag['content'] if image_tag else None
        annotation_tag = soup.find('div', {'itemprop': 'description'})
        annotation = annotation_tag.get_text(strip=True) if annotation_tag else None

        return image_url, annotation
    except Exception as e:
        # Best-effort crawl: log and continue with missing details.
        print(f"Error extracting details from {book_url}: {e}")
        return None, None
# Second pass: visit each book page and fill in cover + annotation.
# NOTE(review): 5000 sequential HTTP requests with no delay — consider a
# polite sleep between requests to avoid being rate-limited.
for idx, row in df.head(5000).iterrows():
    print(f"Fetching details for {row['page_url']}...")
    image_url, annotation = extract_book_details(row['page_url'])
    df.at[idx, 'image_url'] = image_url
    df.at[idx, 'annotation'] = annotation


df.to_csv('books_data_with_details.csv', index=False)
pages/recommendations.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
import pandas as pd
import streamlit as st
import requests
from sentence_transformers import SentenceTransformer, util

st.set_page_config(page_title="Custom Button Example", layout="wide")


@st.cache_resource
def load_model_all_mpnet():
    """Load the symmetric-search encoder (cached for the session)."""
    return SentenceTransformer('all-mpnet-base-v2')


@st.cache_resource
def load_model_msmarco():
    """Load the asymmetric-search encoder (cached for the session)."""
    return SentenceTransformer('msmarco-roberta-base-v3')


@st.cache_data
def load_embeddings(file_path):
    """Load a precomputed book-embedding matrix from an .npy file."""
    return np.load(file_path)


@st.cache_data
def load_data(file_path):
    """Load the cleaned book catalogue CSV."""
    return pd.read_csv(file_path)


# The original defined load_embeddings and load_data twice and imported
# `util` twice; each is now defined/imported exactly once.
model_mp = load_model_all_mpnet()
book_embeddings_mp = load_embeddings('data/book_embeddings.npy')
df = load_data('data/books_data_cleaned.csv')
model_ms = load_model_msmarco()
book_embeddings_ms = load_embeddings('data/book_embeddings_ms.npy')
def get_embedding(text, model):
    """Encode *text* with *model* and return the embedding tensor."""
    # Delegate straight to the encoder; convert_to_tensor keeps the result
    # as a tensor for the downstream cosine-similarity computation.
    return model.encode(text, convert_to_tensor=True)
def get_top_10_recommendations(query, model, book_embeddings, top_k):
    """Return the ``top_k`` catalogue rows most similar to *query*.

    Encodes the query with *model*, scores it against the precomputed
    *book_embeddings* by cosine similarity, and returns the matching rows
    of the module-level ``df`` with an added 'similarity_score' column,
    ordered from most to least similar.
    """
    query_vec = get_embedding(query, model).cpu()
    # Row 0: similarities of the single query against every book.
    scores = util.pytorch_cos_sim(query_vec, book_embeddings)[0].cpu().numpy()
    # Indices of the best-scoring books, highest first.
    best = scores.argsort()[::-1][:top_k]
    recommendations = df.iloc[best].copy()
    recommendations['similarity_score'] = scores[best]
    return recommendations
st.title('Рекомендации книг')

# Let the user pick between symmetric search (query and documents are
# comparable texts, all-mpnet) and asymmetric search (short query against
# long documents, msmarco).
search = st.radio(
    "Выберите тип семантического поиска:",
    [":blue[Симметричный]", ":blue[Асимметричный]"],
    captions=[
        "Используем 'all-mpnet-base-v2'",
        "Используем 'msmarco-roberta-base-v3'",
    ],
)
def params(search):
    """Map the selected search mode to its default query, model and embeddings.

    Parameters
    ----------
    search : str
        The value returned by the search-mode radio button.

    Returns
    -------
    tuple
        ``(default_query_text, model, book_embeddings)`` for the mode.

    Raises
    ------
    ValueError
        For an unrecognized mode. The original implicitly returned None
        here, which crashed later at tuple unpacking with an opaque error.
    """
    if search == ":blue[Симметричный]":
        text = '''Я ищу книги в жанре фэнтези, которые описывают приключения магов и волшебников, обучающихся в специальных магических школах и сражающихся с темными силами или злыми существами. Особенно интересуют произведения, где главные герои сталкиваются с эпическими испытаниями и развивают свои уникальные способности.'''
        return text, model_mp, book_embeddings_mp
    if search == ":blue[Асимметричный]":
        text = '''путешествие во времени'''
        return text, model_ms, book_embeddings_ms
    raise ValueError(f"Unknown search mode: {search!r}")
# Resolve the mode into the default query text, encoder and embeddings.
text, model, book_embeddings = params(search)


col1, col2 = st.columns([3, 1])
with col1:
    # Pre-filled with the mode's example query so users see the expected
    # query style for the selected search type.
    query = st.text_area('Введите запрос, чтобы получить рекомендации', f'{text}', height=95)
with col2:
    # min_value guards against zero/negative counts, which would slice the
    # ranked results to an empty or ill-defined set; default stays 10.
    number = st.number_input(
        "Сколько книг найти?", min_value=1, value=10
    )
    find_button = st.button('Найти', key='find_button', use_container_width=True)

if find_button and query:
    top_10_books = get_top_10_recommendations(query, model, book_embeddings, number)
    # One card per recommended book: cover on the left, details on the right.
    for idx, row in top_10_books.iterrows():
        with st.container():
            col1, col2 = st.columns([1, 3])

            with col1:
                st.image(row['image_url'], width=300)
            with col2:
                st.subheader(f"{row['title']}")
                st.write(f"**Автор:** {row['author']}")
                st.write("---")
                st.write(row['annotation'])
                st.metric(label="Схожесть", value=f"{row['similarity_score']:.3f}")
                st.write(f"**Ссылка:** {row['page_url']}")
                st.write("---")
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
beautifulsoup4==4.12.3
|
2 |
+
pandas==2.2.2
|
3 |
+
Requests==2.32.3
|
4 |
+
streamlit==1.37.0
|
5 |
+
sentence-transformers==3.0.1
|