Spaces:
Sleeping
Sleeping
Update to v2 of book rater
Browse files
app.py
CHANGED
@@ -1,79 +1,135 @@
|
|
1 |
import pickle
|
2 |
import streamlit as st
|
3 |
import numpy as np
|
|
|
4 |
|
5 |
-
st.header("Book
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
final_ratings = pickle.load(open("artifacts/final_ratings.pkl", "rb"))
|
10 |
-
book_pivot = pickle.load(open("artifacts/book_pivot.pkl", "rb"))
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
idsIndex = []
|
15 |
-
posterUrl = []
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
bookNames.append(book_pivot.index[bookId])
|
21 |
-
|
22 |
-
for name in bookNames:
|
23 |
-
ids = np.where(final_ratings['title'] == name)[0][0]
|
24 |
-
idsIndex.append(ids)
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
url = row['img_url']
|
29 |
-
posterUrl.append(url)
|
30 |
|
31 |
-
|
|
|
32 |
|
33 |
-
def
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
|
|
37 |
|
38 |
-
|
|
|
|
|
39 |
|
40 |
-
|
41 |
|
42 |
-
|
43 |
-
|
44 |
|
45 |
-
|
46 |
-
bookList.append(j)
|
47 |
|
48 |
-
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
if st.button("Show Recommendations"):
|
56 |
-
recommendations, posterUrls = recommend_book(selected_books)
|
57 |
-
|
58 |
st.subheader("Recommendations")
|
59 |
|
60 |
-
|
|
|
|
|
61 |
|
62 |
-
|
63 |
-
print(url)
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
st.
|
71 |
-
st.image(posterUrls[2])
|
72 |
|
73 |
-
|
74 |
-
st.text(recommendations[3])
|
75 |
-
st.image(posterUrls[3])
|
76 |
|
77 |
-
|
78 |
-
st.text(recommendations[4])
|
79 |
-
st.image(posterUrls[4])
|
|
|
1 |
import pickle
|
2 |
import streamlit as st
|
3 |
import numpy as np
|
4 |
+
import bookdb
|
5 |
|
6 |
+
st.header("My Book Buddy 🐛")
|
7 |
|
8 |
+
# Seed the per-session state on the first run of the script; on later reruns
# the keys already exist and their current values are left untouched.
_SESSION_DEFAULTS = {
    "upvoted_book_ids": [],
    "downvoted_book_ids": [],
    "recommendedBooksData": None,
    "topCorrelatedReadersData": None,
    "numSimilarUsers": 0,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
22 |
|
23 |
+
def update_display(displayData):
    """Copy a result dict from ``bookdb.update_user_ratings`` into session state.

    ``displayData`` must provide the keys ``upvotedBookIds``,
    ``downvotedBookIds``, ``numSimilarUsers``, ``recommendedBooksData`` and
    ``topCorrelatedReadersData``; a missing key raises ``KeyError``.
    """
    # (session-state key, displayData key), written in this order.
    state_to_data_key = (
        ("upvoted_book_ids", "upvotedBookIds"),
        ("downvoted_book_ids", "downvotedBookIds"),
        ("numSimilarUsers", "numSimilarUsers"),
        ("recommendedBooksData", "recommendedBooksData"),
        ("topCorrelatedReadersData", "topCorrelatedReadersData"),
    )
    for state_key, data_key in state_to_data_key:
        st.session_state[state_key] = displayData[data_key]
|
30 |
+
|
31 |
+
def on_reset_votes():
    """Button callback: forget every vote and all derived recommendation data."""
    # Fresh list objects are created on each call so state is never shared
    # between resets.
    cleared_state = (
        ("upvoted_book_ids", []),
        ("downvoted_book_ids", []),
        ("numSimilarUsers", 0),
        ("recommendedBooksData", None),
        ("topCorrelatedReadersData", None),
    )
    for state_key, cleared_value in cleared_state:
        st.session_state[state_key] = cleared_value
|
37 |
+
|
38 |
+
def on_submit_votes():
    """Form-submit callback: merge the newly selected books into the stored votes.

    Reads the two multiselect widgets, resolves the chosen titles to book ids
    through ``bookdb.get_book_ids_by_title`` and combines them with the ids
    already held in session state. When a book ends up on both sides the
    upvote wins and the downvote is dropped.

    Validation runs on the merged, de-duplicated lists, so picking the same
    book in both widgets cannot leave the downvote list empty and still pass
    the "one upvote and one downvote" check. If validation fails a warning is
    shown and session state is left unchanged. Otherwise the widgets are
    cleared and the result of ``bookdb.update_user_ratings`` is stored via
    ``update_display``.
    """
    upvoteBookTitles = st.session_state["multiselect_upvote"]
    downvoteBookTitles = st.session_state["multiselect_downvote"]

    if len(upvoteBookTitles) == 0 and len(downvoteBookTitles) == 0:
        st.warning("Please select at least one book to upvote or downvote")
        return

    booksToUpvote = bookdb.get_book_ids_by_title(upvoteBookTitles)
    booksToDownvote = bookdb.get_book_ids_by_title(downvoteBookTitles)

    # Build new lists rather than extending the session-state lists in place,
    # so nothing is modified if validation fails or bookdb raises below.
    upvotedBookIds = list(st.session_state["upvoted_book_ids"]) + list(booksToUpvote)
    allDownvotedBookIds = list(st.session_state["downvoted_book_ids"]) + list(booksToDownvote)

    # An upvote takes precedence over a downvote for the same book.
    downvotedBookIds = [x for x in allDownvotedBookIds if x not in upvotedBookIds]

    # Validate what will actually be submitted (after de-duplication).
    if len(upvotedBookIds) == 0 or len(downvotedBookIds) == 0:
        st.warning("You must select at least one book to upvote and one book to downvote")
        return

    # clear the multiselects
    st.session_state.multiselect_upvote = []
    st.session_state.multiselect_downvote = []

    update_display(bookdb.update_user_ratings(upvotedBookIds, downvotedBookIds))
|
70 |
|
71 |
+
# Voting form: two side-by-side multiselects offering every title that has
# not been voted on yet, plus a submit button wired to on_submit_votes.
with st.form(key='upvote_form'):
    col1, col2 = st.columns(2)

    allBookTitles = bookdb.get_all_book_titles()

    # A set gives constant-time membership tests, so the filter below is a
    # single pass over all titles instead of one scan of the rated titles per
    # title.
    myRatedBookTitles = set(
        bookdb.get_book_titles(
            st.session_state["upvoted_book_ids"] + st.session_state["downvoted_book_ids"]
        )
    )

    # remove myRatedBookTitles from allBookTitles
    remainingBookTitles = [x for x in allBookTitles if x not in myRatedBookTitles]

    col1.multiselect(
        'Upvote Books 👍',
        remainingBookTitles,
        key='multiselect_upvote'
    )
    col2.multiselect(
        'Downvote Books 👎',
        remainingBookTitles,
        key='multiselect_downvote'
    )
    st.form_submit_button(label='Submit', type="primary", on_click=on_submit_votes)
|
92 |
+
|
93 |
+
# Show the recommendations table only once a submission has produced one.
df = st.session_state["recommendedBooksData"]
if df is not None:
    st.subheader("Recommendations")
    st.dataframe(df, hide_index=True, use_container_width=True)
|
99 |
+
|
100 |
+
st.button('Reset All Ratings', type="secondary", on_click=on_reset_votes)
|
101 |
|
102 |
+
st.subheader("Your Ratings")
|
|
|
103 |
|
104 |
+
displayCol1, displayCol2 = st.columns(2)
|
105 |
+
|
106 |
+
# Render each vote list in its own column: a count line followed by one
# "id-title" bullet per book. A column stays empty until it has votes.
for _column, _label, _state_key in (
    (displayCol1, "Upvoted", "upvoted_book_ids"),
    (displayCol2, "Downvoted", "downvoted_book_ids"),
):
    _voted_ids = st.session_state[_state_key]
    if len(_voted_ids) > 0:
        _column.markdown(f"{_label}: {len(_voted_ids)} book(s)")
        for _voted_id in _voted_ids:
            _column.markdown(f" - {_voted_id}-{bookdb.get_book_title(_voted_id)}")
|
121 |
+
|
122 |
+
st.divider()
|
123 |
+
|
124 |
+
# st.write(f"Similar User Min Percent Shared Books = {round(bookdb.SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS * 100)}%")
|
125 |
+
# st.write(f"Similar User Min Correlation = {bookdb.SIMILAR_USER_MIN_CORRELATION}")
|
126 |
+
|
127 |
+
# if "numSimilarUsers" in st.session_state:
|
128 |
+
# st.write(f"{st.session_state['numSimilarUsers']} similar users")
|
129 |
|
130 |
+
# Show the correlated-readers table only once a submission has produced one.
df = st.session_state["topCorrelatedReadersData"]
if df is not None:
    st.subheader("Top Correlated Readers")
    st.dataframe(df, use_container_width=True)
|
|
|
|
bookdb.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
# A reader only counts as "similar" when they have rated more than this
# fraction of the books the current user voted on (see update_user_ratings).
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# ...and their rating correlation with the current user exceeds this value.
SIMILAR_USER_MIN_CORRELATION = 0.25

# Only these columns of the book catalogue are loaded.
_BOOK_COLUMNS = ["book_id", "title", "average_rating"]
books = pd.read_csv("./goodreads/books.csv", usecols=_BOOK_COLUMNS)
# Replace the ids read from the file with the 1-based row position.
# NOTE(review): presumably this aligns the ids with ratings.csv; confirm.
books['book_id'] = range(1, len(books) + 1)

# Existing reader ratings; update_user_ratings reads the user_id column and
# joins this frame to `books` on book_id.
baseRatings = pd.read_csv("./goodreads/ratings.csv")
|
15 |
+
|
16 |
+
def get_book_ids_by_title(book_titles):
    """Return the ``book_id`` of every catalogue row whose title is in *book_titles*.

    Results follow catalogue row order, not the order of *book_titles*.
    """
    title_matches = books["title"].isin(book_titles)
    return books.loc[title_matches, "book_id"].values
|
18 |
+
|
19 |
+
def get_all_book_titles():
    """Return the title of every book in the catalogue, in row order."""
    title_column = books["title"]
    return title_column.values
|
21 |
+
|
22 |
+
def get_book_title(book_id):
    """Return the title of the book whose ``book_id`` equals *book_id*.

    If several rows match, the first one in catalogue order is returned.

    Raises:
        IndexError: if no book has that id. The exception type is the same
            one the bare ``[0]`` lookup raised; only the message is new.
    """
    matching_titles = books.loc[books["book_id"] == book_id, "title"].values
    if len(matching_titles) == 0:
        raise IndexError(f"no book with book_id {book_id!r}")
    return matching_titles[0]
|
24 |
+
|
25 |
+
def get_book_titles(book_ids):
    """Return the titles of every catalogue row whose ``book_id`` is in *book_ids*.

    Results follow catalogue row order; ids that match no row are ignored.
    """
    id_matches = books["book_id"].isin(book_ids)
    return books.loc[id_matches, "title"].values
|
27 |
+
|
28 |
+
def _empty_ratings_result(upvotedBookIds, downvotedBookIds):
    """Build the update_user_ratings result used when nothing can be recommended."""
    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers": 0,
        # Same columns as a real recommendation table, with zero rows.
        "recommendedBooksData": books.iloc[0:0].drop(columns=["book_id"]),
        "topCorrelatedReadersData": pd.DataFrame(columns=["corr"]),
    }


def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Recommend books for a temporary user defined by the given votes.

    The votes are turned into ratings (5 for an upvote, 1 for a downvote) for
    a new user id one past the largest id in ``baseRatings``. Readers who
    rated more than ``SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS`` of the voted
    books are correlated with that user; those whose correlation exceeds
    ``SIMILAR_USER_MIN_CORRELATION`` contribute their ratings, weighted by
    the correlation. At most 20 books with a mean weighted rating above 1 are
    returned, excluding books the user already voted on.

    Neither ``books`` nor ``baseRatings`` is modified.

    Args:
        upvotedBookIds: ids of the books the user liked.
        downvotedBookIds: ids of the books the user disliked.

    Returns:
        A dict with keys:
        ``upvotedBookIds`` / ``downvotedBookIds`` - the arguments, unchanged;
        ``numSimilarUsers`` - number of readers passing the shared-books
        filter (NOTE(review): this count appears to include the temporary
        user itself, since it has rated every voted book; confirm);
        ``recommendedBooksData`` - rows of ``books`` without ``book_id``,
        sorted by ``average_rating`` descending;
        ``topCorrelatedReadersData`` - a ``corr`` column indexed by user id,
        sorted descending.
        If there are no votes, or no vote matches a known book, the result
        has zero similar users and empty tables.
    """
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1

    # len() is used instead of truthiness because callers may pass arrays.
    if len(upvotedBookIds) == 0 and len(downvotedBookIds) == 0:
        return _empty_ratings_result(upvotedBookIds, downvotedBookIds)

    # Use an id one past the largest existing one for the temporary user.
    newUserId = baseRatings['user_id'].max() + 1

    votedBookIds = list(upvotedBookIds) + list(downvotedBookIds)
    newRows = pd.DataFrame({
        'book_id': votedBookIds,
        'user_id': [newUserId] * len(votedBookIds),
        'rating': ([RATING_FOR_UPVOTE] * len(upvotedBookIds)
                   + [RATING_FOR_DOWNVOTE] * len(downvotedBookIds)),
    })

    ratings = pd.concat([baseRatings, newRows], ignore_index=True)
    df = pd.merge(books, ratings, on="book_id", how="inner")

    # One row per user, one column per title, mean rating in each cell.
    user_df = df.groupby(["user_id", "title"])["rating"].mean().unstack()

    # The inner merge drops votes for ids that are not in `books`; if none
    # survived there is no row for the temporary user to correlate against.
    if newUserId not in user_df.index:
        return _empty_ratings_result(upvotedBookIds, downvotedBookIds)

    targetUserDf = user_df[user_df.index == newUserId]
    targetBooksRead = targetUserDf.dropna(axis=1).columns.tolist()

    # Restrict every user's ratings to the books the target user voted on.
    book_read_df = user_df[targetBooksRead]

    # How many of those books each user has rated.
    userBookCount = book_read_df.notnull().sum(axis=1)

    # Keep users who rated more than the required share of those books.
    minBookCount = book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS
    usersSameBooks = userBookCount[userBookCount > minBookCount].index

    # Unrated books count as 0 when computing the correlation.
    filtered_df = book_read_df[book_read_df.index.isin(usersSameBooks)].fillna(0)

    # Pairwise user-to-user correlation, flattened to a (user, user) series.
    corr_df = filtered_df.T.corr().unstack()

    targetCorr = corr_df[newUserId]
    top_readers = pd.DataFrame(targetCorr[targetCorr > SIMILAR_USER_MIN_CORRELATION], columns=["corr"])

    # The target user correlates perfectly with itself; leave it out.
    if newUserId in top_readers.index:
        top_readers = top_readers.drop(newUserId)

    # get the ratings for the top readers
    top_readers_ratings = pd.merge(top_readers, df[["user_id", "book_id", "rating"]], how='inner', on="user_id")

    if top_readers_ratings.empty:
        # No reader is similar enough, so there is nothing to aggregate.
        recommendedBookIds = []
    else:
        # weight their ratings by how correlated they are with the user
        top_readers_ratings['weighted_rating'] = top_readers_ratings['corr'] * top_readers_ratings['rating']

        # mean weighted rating per book
        recommendation_df = top_readers_ratings.pivot_table(values="weighted_rating", index="book_id", aggfunc="mean")

        # Zero out books the user already voted on so the threshold below
        # excludes them.
        recommendation_df.loc[recommendation_df.index.isin(votedBookIds)] = 0

        # Top 20 books by weighted rating, ignoring anything at or below 1.
        books_recommend = recommendation_df[recommendation_df["weighted_rating"] > 1].sort_values(by="weighted_rating", ascending=False).head(20)
        recommendedBookIds = books_recommend.index

    # get the recommended books (and sort by average_rating)
    recommendedBooks = books[books["book_id"].isin(recommendedBookIds)].sort_values(by="average_rating", ascending=False)
    recommendedBooks = recommendedBooks.drop(columns=["book_id"])

    return {
        "upvotedBookIds": upvotedBookIds,
        "downvotedBookIds": downvotedBookIds,
        "numSimilarUsers": len(usersSameBooks),
        "recommendedBooksData": recommendedBooks,
        # sort by correlation
        "topCorrelatedReadersData": top_readers.sort_values(by="corr", ascending=False),
    }
|