File size: 3,301 Bytes
faf4817
 
 
 
 
 
b8d0a69
faf4817
 
 
 
b8d0a69
faf4817
 
 
 
b8d0a69
 
faf4817
b8d0a69
faf4817
b8d0a69
 
 
 
 
ebdb067
b8d0a69
 
 
faf4817
 
 
 
 
 
 
 
 
 
942def5
faf4817
942def5
 
 
ae3716f
942def5
b8d0a69
faf4817
 
 
b8d0a69
 
faf4817
b8d0a69
 
 
 
 
 
 
 
 
 
 
faf4817
 
 
 
 
 
 
 
 
b8d0a69
faf4817
 
 
 
 
 
 
 
 
b8d0a69
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import streamlit as st
import pandas as pd 
import torch
from PIL import Image
from io import BytesIO
import requests
import faiss


from transformers import AutoTokenizer, AutoModel
import numpy as np
# Use the full browser width; must be the first Streamlit call in the script.
st.set_page_config(layout="wide")

@st.cache_resource()
def load_model():
    """Load the rubert-tiny2 encoder and tokenizer, cached for the app's lifetime."""
    checkpoint = "cointegrated/rubert-tiny2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return model, tokenizer

# Instantiate the (cached) encoder once at module import time.
model, tokenizer = load_model()

@st.cache_data()
def load_data():
    """Load the book catalogue, its precomputed embeddings, and the FAISS index (cached)."""
    books = pd.read_csv('Dataset/parcedbooks.csv')
    embeddings = []
    with open('Dataset/embeddingsbooks.txt', 'r') as handle:
        # One whitespace-separated embedding vector per line.
        for row in handle:
            embeddings.append([float(value) for value in row.split()])
    faiss_index = faiss.read_index('Dataset/faissbooks.index')
    return books, embeddings, faiss_index

# Load catalogue, embeddings, and search index once at module import time.
df, embeddings_list, index = load_data()

def embed_bert_cls(text, model, tokenizer):
    """Encode *text* into a unit-norm [CLS] embedding as a 1-D numpy array."""
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every input tensor to the model's device before the forward pass.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**encoded)
    cls_vector = output.last_hidden_state[:, 0, :]
    cls_vector = torch.nn.functional.normalize(cls_vector)
    return cls_vector[0].cpu().numpy()


col3, col4 = st.columns([5, 1])

with col3:
    text = st.text_input('Введите ваше предпочтение для рекомендации')
with col4:
    # min_value=1 guards against k <= 0, which would make the FAISS search fail.
    num = st.number_input('Количество книг', step=1, value=1, min_value=1)
    button = st.button('Отправить запрос')


def _fetch_cover(url):
    """Best-effort download of a cover image.

    Returns a PIL Image, or None on any network/decoding error, so that a
    single broken URL does not crash the whole results page.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception:
        return None


if text and button:
    # Embed the query text with the cached BERT model.
    decode_text = embed_bert_cls(text, model, tokenizer)
    k = int(num)
    # D holds squared L2 distances, I the row indices of the k nearest books.
    D, I = index.search(decode_text.reshape(1, -1), k)

    top_similar_indices = I[0]
    top_similar_annotations = [df['annotation'].iloc[i] for i in top_similar_indices]
    top_similar_images = [df['image_url'].iloc[i] for i in top_similar_indices]
    images = [_fetch_cover(url) for url in top_similar_images]
    top_similar_authors = [df['author'].iloc[i] for i in top_similar_indices]
    top_similar_title = [df['title'].iloc[i] for i in top_similar_indices]
    top_similar_url = [df['page_url'].iloc[i] for i in top_similar_indices]
    # For unit-norm embeddings, squared L2 distance d maps to cosine similarity as 1 - d / 2.
    top_cosine_similarities = [1 - d / 2 for d in D[0]]

    # Render each recommendation: cover image on the left, metadata on the right.
    for similarity, image, author, annotation, title, url in zip(
        top_cosine_similarities, images, top_similar_authors,
        top_similar_annotations, top_similar_title, top_similar_url,
    ):
        col1, col2 = st.columns([3, 4])
        with col1:
            if image is not None:
                st.image(image, width=300)
            else:
                st.write('Изображение недоступно')
        with col2:
            st.write(f"***Автор:*** {author}")
            st.write(f"***Название:*** {title}")
            st.write(f"***Аннотация:*** {annotation}")
            similarity = float(similarity)
            st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
            st.write(f"***Ссылка на книгу : {url}***")

        st.markdown(
            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
            unsafe_allow_html=True,
        )