PolyakovK commited on
Commit
a23f3fb
·
1 Parent(s): ab4df41
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd


st.title('Рекомендация книг на основе пользовательского промта')
st.subheader('Основная информация')


# Cache the CSV read so Streamlit does not reload the dataset on every
# rerun — consistent with the cached load_data in pages/recommendations.py.
@st.cache_data
def load_data(file_path):
    """Read the cleaned books dataset from *file_path* into a DataFrame."""
    return pd.read_csv(file_path)


df = load_data('data/books_data_cleaned.csv')
11
+
12
# Summary table of the scraping run shown on the landing page.
parsing_info = pd.DataFrame({
    'Источник': ['chitai-gorod.ru'],
    'Кол-во книг': ['5000'],
    'Уникальных авторов': ['2112'],
    'Жанр': ['Художественная лит-ра'],
    'Время парсинга': ['77 минут'],
})
st.dataframe(parsing_info)
st.subheader('Используемые модели:')

# Model references rendered as raw HTML so the anchor tags stay clickable.
models_info = pd.DataFrame({
    'sentence-transformers': ['<a href="https://huggingface.co/sentence-transformers/all-mpnet-base-v2" target="_blank">all-mpnet-base-v2</a>'],
    'Pre-Trained MS MARCO Models': ['<a href="https://huggingface.co/sentence-transformers/msmarco-roberta-base-v3" target="_blank">msmarco-roberta-base-v3</a>'],
})
st.markdown(models_info.to_html(escape=False), unsafe_allow_html=True)
29
+
30
+
31
def show_random_books():
    """Render a random sample of books in a two-books-per-row grid.

    Each visual row uses four Streamlit columns: cover | info | cover | info.
    """
    # Guard against a catalogue smaller than 10 rows — DataFrame.sample
    # raises ValueError when n exceeds the population size.
    sample_df = df.sample(n=min(10, len(df)))
    num_books = len(sample_df)

    for start in range(0, num_books, 2):
        cols = st.columns(4)
        # j is the slot (0 = left pair of columns, 1 = right pair); the
        # original duplicated this body per slot — folded via index math.
        for j, (_, row) in enumerate(sample_df.iloc[start:start + 2].iterrows()):
            with cols[2 * j]:        # book cover
                st.image(row['image_url'], width=200)
            with cols[2 * j + 1]:    # book details
                st.subheader(row['title'])
                st.write(f"Автор: {row['author']}")
                st.write(f"**Ссылка:** [книга]({row['page_url']})")
                # Separator after every book; the original drew it only in
                # the left column, leaving the grid visually inconsistent.
                st.write("---")


if st.button('Показать 10 случайных книг'):
    show_random_books()
data/book_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb86f342a62c07926d39da48daab2d4481d7fcf4437eae254eaa42ef8ac0e91
3
+ size 15283328
data/book_embeddings_ms.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90b7ba8398f552f593df4688485b51d0aea57e9712a480878f79799d17a59c9
3
+ size 15283328
data/books_data_cleaned.csv ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/parser.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Accumulator for the scraped catalogue; filled page by page below.
df = pd.DataFrame(columns=['page_url', 'image_url', 'author', 'title', 'annotation'])

# requests has NO default timeout — without one a stalled connection can
# hang the scraper forever.
REQUEST_TIMEOUT = 30


def extract_data_from_page(page_number):
    """Scrape one catalogue page and return a list of dicts
    (``page_url`` / ``title`` / ``author``) for every product card found.

    A parse failure on an individual card is logged and skipped so one
    malformed card does not abort the whole page.
    """
    url = f'https://www.chitai-gorod.ru/catalog/books/hudozhestvennaya-literatura-110001?page={page_number}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find_all('article', class_='product-card')

    data = []
    for book in books:
        try:
            book_url = book.find('a', class_='product-card__picture')['href']
            title = book.find('div', class_='product-title__head').get_text(strip=True)
            author = book.find('div', class_='product-title__author').get_text(strip=True)
            absolute_url = f'https://www.chitai-gorod.ru{book_url}'

            data.append({'page_url': absolute_url, 'title': title, 'author': author})
        except Exception as e:
            print(f"Error processing book: {e}")

    return data
26
+
27
+
28
# Walk catalogue pages until 5000 books are collected or pages run out.
# NOTE(review): starts at page 2 — presumably page 1 was handled in an
# earlier run; confirm with the author.
for page in range(2, 201):
    print(f"Processing page {page}...")
    page_rows = extract_data_from_page(page)
    df = pd.concat([df, pd.DataFrame(page_rows)], ignore_index=True)
    if len(df) >= 5000:
        break

# Trim any overshoot from the final page.
df = df.head(5000)
36
+
37
+
38
def extract_book_details(book_url):
    """Fetch one book page and return ``(image_url, annotation)``.

    Either element may be None when the corresponding tag is missing; on
    any request or parsing error the function logs and returns
    ``(None, None)`` so the calling loop can continue with the next book.
    """
    try:
        # Timeout added so a single stalled request cannot hang the scraper.
        response = requests.get(book_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # NOTE(review): Open Graph tags are conventionally emitted as
        # property="og:image", not name= — keeping the original selector
        # since it evidently matched this site; confirm before changing.
        image_tag = soup.find('meta', {'name': 'og:image'})
        image_url = image_tag['content'] if image_tag else None
        annotation_tag = soup.find('div', {'itemprop': 'description'})
        annotation = annotation_tag.get_text(strip=True) if annotation_tag else None

        return image_url, annotation
    except Exception as e:
        print(f"Error extracting details from {book_url}: {e}")
        return None, None
51
+
52
# Enrich every collected row with its cover image URL and annotation.
for idx, row in df.head(5000).iterrows():
    page_url = row['page_url']
    print(f"Fetching details for {page_url}...")
    df.at[idx, 'image_url'], df.at[idx, 'annotation'] = extract_book_details(page_url)


df.to_csv('books_data_with_details.csv', index=False)
pages/recommendations.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import pandas as pd
import streamlit as st
import requests
# Original imported `util` twice (once alone, once with SentenceTransformer);
# a single combined import is sufficient.
from sentence_transformers import SentenceTransformer, util

st.set_page_config(page_title="Custom Button Example", layout="wide")


# Models are cached as resources so each is loaded once per server process.
@st.cache_resource
def load_model_all_mpnet():
    """Load the symmetric-search encoder."""
    return SentenceTransformer('all-mpnet-base-v2')


@st.cache_resource
def load_model_msmarco():
    """Load the asymmetric-search (query -> document) encoder."""
    return SentenceTransformer('msmarco-roberta-base-v3')


# Original defined this function twice with identical bodies; one cached
# loader serves both embedding files.
@st.cache_data
def load_embeddings(file_path):
    """Load pre-computed book embeddings from an .npy file."""
    return np.load(file_path)


@st.cache_data
def load_data(file_path):
    """Load the cleaned books dataset."""
    return pd.read_csv(file_path)


model_mp = load_model_all_mpnet()
model_ms = load_model_msmarco()
book_embeddings_mp = load_embeddings('data/book_embeddings.npy')
book_embeddings_ms = load_embeddings('data/book_embeddings_ms.npy')
df = load_data('data/books_data_cleaned.csv')
29
+
30
+
31
def get_embedding(text, model):
    """Encode *text* with *model* and return the resulting embedding tensor."""
    return model.encode(text, convert_to_tensor=True)
34
+
35
+
36
def get_top_10_recommendations(query, model, book_embeddings, top_k):
    """Return the *top_k* catalogue rows most similar to *query*.

    Computes cosine similarity between the encoded query and the
    pre-computed book embeddings, then returns a copy of the matching
    rows of the global ``df`` with an extra ``similarity_score`` column,
    ordered best-first.
    """
    query_embedding = get_embedding(query, model).cpu()
    scores = util.pytorch_cos_sim(query_embedding, book_embeddings)[0].cpu().numpy()
    best = scores.argsort()[::-1][:top_k]  # highest similarity first
    recommendations = df.iloc[best].copy()
    recommendations['similarity_score'] = scores[best]
    return recommendations
44
+
45
+
46
st.title('Рекомендации книг')

# Toggle between the two retrieval setups; the captions name the model
# backing each mode.
_search_options = [":blue[Симметричный]", ":blue[Асимметричный]"]
_search_captions = [
    "Используем 'all-mpnet-base-v2'",
    "Используем 'msmarco-roberta-base-v3'",
]
search = st.radio(
    "Выберите тип семантического поиска:",
    _search_options,
    captions=_search_captions,
)
56
+
57
def params(search):
    """Map the selected search mode to ``(example_text, model, embeddings)``.

    Returns a default prompt for the text area, the SentenceTransformer to
    encode queries with, and the matching pre-computed embedding matrix.

    Raises:
        ValueError: if *search* is not one of the two radio options.
            (The original silently fell through returning None, which
            surfaced later as an opaque tuple-unpacking TypeError.)
    """
    if search == ":blue[Симметричный]":
        text = '''Я ищу книги в жанре фэнтези, которые описывают приключения магов и волшебников, обучающихся в специальных магических школах и сражающихся с темными силами или злыми существами. Особенно интересуют произведения, где главные герои сталкиваются с эпическими испытаниями и развивают свои уникальные способности.'''
        return text, model_mp, book_embeddings_mp
    elif search == ":blue[Асимметричный]":
        text = '''путешествие во времени'''
        return text, model_ms, book_embeddings_ms
    raise ValueError(f"Unknown search mode: {search!r}")
68
# Resolve the prompt text, encoder and embedding matrix for the chosen mode.
text, model, book_embeddings = params(search)


col1, col2 = st.columns([3, 1])
with col1:
    query = st.text_area('Введите запрос, чтобы получить рекомендации', f'{text}', height=95)
with col2:
    number = st.number_input(
        "Сколько книг найти?", value=10
    )
    # NOTE(review): original indentation was lost in extraction; the button
    # is assumed to sit inside col2 (use_container_width fills the column) —
    # confirm against the deployed layout.
    find_button = st.button('Найти', key='find_button', use_container_width=True)
79
+
80
# Render the recommendation cards once the user submits a non-empty query.
if find_button and query:
    results = get_top_10_recommendations(query, model, book_embeddings, number)
    for _, row in results.iterrows():
        with st.container():
            cover_col, info_col = st.columns([1, 3])

            with cover_col:
                st.image(row['image_url'], width=300)
            with info_col:
                st.subheader(f"{row['title']}")
                st.write(f"**Автор:** {row['author']}")
                st.write("---")
                st.write(row['annotation'])
                st.metric(label="Схожесть", value=f"{row['similarity_score']:.3f}")
                st.write(f"**Ссылка:** {row['page_url']}")
                st.write("---")
96
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.3
2
+ pandas==2.2.2
3
+ Requests==2.32.3
4
+ streamlit==1.37.0
5
+ sentence-transformers==3.0.1