conlan committed on
Commit
8189b4d
·
1 Parent(s): 6c5993d

Update to v2 of book rater

Browse files
Files changed (2) hide show
  1. app.py +110 -54
  2. bookdb.py +132 -0
app.py CHANGED
@@ -1,79 +1,135 @@
1
  import pickle
2
  import streamlit as st
3
  import numpy as np
 
4
 
5
- st.header("Book Recommender System")
6
 
7
- model = pickle.load(open("artifacts/model.pkl", "rb"))
8
- book_names = pickle.load(open("artifacts/book_names.pkl", "rb"))
9
- final_ratings = pickle.load(open("artifacts/final_ratings.pkl", "rb"))
10
- book_pivot = pickle.load(open("artifacts/book_pivot.pkl", "rb"))
11
 
12
- def fetch_poster(suggestion):
13
- bookNames = []
14
- idsIndex = []
15
- posterUrl = []
16
 
17
- for bookId in suggestion[0]:
18
- name = book_pivot.index[bookId]
19
-
20
- bookNames.append(book_pivot.index[bookId])
21
-
22
- for name in bookNames:
23
- ids = np.where(final_ratings['title'] == name)[0][0]
24
- idsIndex.append(ids)
25
 
26
- for idx in idsIndex:
27
- row = final_ratings.iloc[idx]
28
- url = row['img_url']
29
- posterUrl.append(url)
30
 
31
- return posterUrl
 
32
 
33
- def recommend_book(bookName):
34
- bookList = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- book_id = np.where(book_pivot.index == bookName)[0][0]
 
37
 
38
- distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1, -1), n_neighbors=5)
 
 
39
 
40
- poster_url = fetch_poster(suggestion)
41
 
42
- for i in range(len(suggestion)):
43
- books = book_pivot.index[suggestion[i]]
44
 
45
- for j in books:
46
- bookList.append(j)
47
 
48
- return bookList, poster_url
49
 
50
- selected_books = st.selectbox(
51
- "Select a book",
52
- book_names
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- if st.button("Show Recommendations"):
56
- recommendations, posterUrls = recommend_book(selected_books)
57
-
58
  st.subheader("Recommendations")
59
 
60
- col1, col2, col3, col4 = st.columns(4)
 
 
61
 
62
- for url in posterUrls:
63
- print(url)
64
 
65
- with col1:
66
- st.text(recommendations[1])
67
- st.image(posterUrls[1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- with col2:
70
- st.text(recommendations[2])
71
- st.image(posterUrls[2])
72
 
73
- with col3:
74
- st.text(recommendations[3])
75
- st.image(posterUrls[3])
76
 
77
- with col4:
78
- st.text(recommendations[4])
79
- st.image(posterUrls[4])
 
1
  import pickle
2
  import streamlit as st
3
  import numpy as np
4
+ import bookdb
5
 
6
+ st.header("My Book Buddy 🐛")
7
 
8
# Seed the per-session state the first time the script runs for a session.
# Keys that already exist are left alone, so reruns keep the user's votes and
# the last computed results. Each default is produced by a factory so the two
# vote lists are distinct, freshly created list objects.
for _state_key, _make_default in (
    ("upvoted_book_ids", list),
    ("downvoted_book_ids", list),
    ("recommendedBooksData", lambda: None),
    ("topCorrelatedReadersData", lambda: None),
    ("numSimilarUsers", int),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
22
 
23
def update_display(displayData):
    """Copy a result payload into Streamlit session state.

    Args:
        displayData: mapping with the keys ``upvotedBookIds``,
            ``downvotedBookIds``, ``numSimilarUsers``,
            ``recommendedBooksData`` and ``topCorrelatedReadersData``.

    Raises:
        KeyError: if any of those keys is missing from *displayData*.
    """
    state = st.session_state
    # (session-state key, payload key) pairs, written in this order.
    for state_key, payload_key in (
        ("upvoted_book_ids", "upvotedBookIds"),
        ("downvoted_book_ids", "downvotedBookIds"),
        ("numSimilarUsers", "numSimilarUsers"),
        ("recommendedBooksData", "recommendedBooksData"),
        ("topCorrelatedReadersData", "topCorrelatedReadersData"),
    ):
        state[state_key] = displayData[payload_key]
30
+
31
def on_reset_votes():
    """Button callback: forget every vote and every computed result."""
    state = st.session_state
    # Fresh, separate lists for the two vote buckets.
    state["upvoted_book_ids"], state["downvoted_book_ids"] = [], []
    state["numSimilarUsers"] = 0
    # ``None`` marks "nothing computed yet" for both result tables.
    for result_key in ("recommendedBooksData", "topCorrelatedReadersData"):
        state[result_key] = None
37
+
38
def on_submit_votes():
    """Form callback: merge the selected titles into the stored votes and recompute.

    Reads the two multiselect widgets (``multiselect_upvote`` and
    ``multiselect_downvote``), resolves the chosen titles to book ids through
    ``bookdb``, combines them with the votes already held in session state and
    passes the result to ``bookdb.update_user_ratings``. The returned payload
    is stored via ``update_display``.

    A warning is shown and nothing is changed when no title was selected, or
    when the combined votes would not contain at least one upvote and one
    downvote.
    """
    upvote_titles = st.session_state["multiselect_upvote"]
    downvote_titles = st.session_state["multiselect_downvote"]

    if len(upvote_titles) == 0 and len(downvote_titles) == 0:
        st.warning("Please select at least one book to upvote or downvote")
        return

    new_upvotes = bookdb.get_book_ids_by_title(upvote_titles)
    new_downvotes = bookdb.get_book_ids_by_title(downvote_titles)

    # Build new lists instead of extending the session-state lists in place:
    # if validation or the recompute below fails, the stored votes stay
    # exactly as they were. ``dict.fromkeys`` removes duplicates while keeping
    # first-seen order, and ``int()`` normalises numpy integers to plain ints.
    upvoted = list(dict.fromkeys(
        int(book_id)
        for book_id in (*st.session_state["upvoted_book_ids"], *new_upvotes)
    ))

    # An upvote wins over a downvote for the same book.
    upvoted_lookup = set(upvoted)
    downvoted = list(dict.fromkeys(
        int(book_id)
        for book_id in (*st.session_state["downvoted_book_ids"], *new_downvotes)
        if int(book_id) not in upvoted_lookup
    ))

    # Validate AFTER resolving conflicts: a title picked in both lists must
    # not count as the required downvote once it has been removed from it.
    if not upvoted or not downvoted:
        st.warning("You must select at least one book to upvote and one book to downvote")
        return

    # clear the multiselects
    st.session_state.multiselect_upvote = []
    st.session_state.multiselect_downvote = []

    update_display(bookdb.update_user_ratings(upvoted, downvoted))
70
 
71
+ with st.form(key='upvote_form'):
72
+ col1, col2 = st.columns(2)
73
 
74
+ allBookTitles = bookdb.get_all_book_titles()
 
75
 
76
+ myRatedBookTitles = bookdb.get_book_titles(st.session_state["upvoted_book_ids"] + st.session_state["downvoted_book_ids"])
77
 
78
+ # remove myRatedBookTitles from allBookTitles
79
+ remainingBookTitles = [x for x in allBookTitles if x not in myRatedBookTitles]
80
+
81
+ col1.multiselect(
82
+ 'Upvote Books 👍',
83
+ remainingBookTitles,
84
+ key='multiselect_upvote'
85
+ )
86
+ col2.multiselect(
87
+ 'Downvote Books 👎',
88
+ remainingBookTitles,
89
+ key='multiselect_downvote'
90
+ )
91
+ st.form_submit_button(label='Submit', type="primary", on_click=on_submit_votes)
92
+
93
+ if st.session_state["recommendedBooksData"] is not None:
94
+ df = st.session_state["recommendedBooksData"]
95
 
 
 
 
96
  st.subheader("Recommendations")
97
 
98
+ st.dataframe(df, hide_index=True, use_container_width=True)
99
+
100
+ st.button('Reset All Ratings', type="secondary", on_click=on_reset_votes)
101
 
102
+ st.subheader("Your Ratings")
 
103
 
104
+ displayCol1, displayCol2 = st.columns(2)
105
+
106
+ if len(st.session_state["upvoted_book_ids"]) > 0:
107
+ upvotedBookIds = st.session_state["upvoted_book_ids"]
108
+
109
+ displayCol1.markdown(f"Upvoted: {len(upvotedBookIds)} book(s)")
110
+
111
+ for bookId in upvotedBookIds:
112
+ displayCol1.markdown(f' - {bookId}-{bookdb.get_book_title(bookId)}')
113
+
114
+ if len(st.session_state["downvoted_book_ids"]) > 0:
115
+ downvotedBookIds = st.session_state["downvoted_book_ids"]
116
+
117
+ displayCol2.markdown(f'Downvoted: {len(downvotedBookIds)} book(s)')
118
+
119
+ for bookId in downvotedBookIds:
120
+ displayCol2.markdown(f' - {bookId}-{bookdb.get_book_title(bookId)}')
121
+
122
+ st.divider()
123
+
124
+ # st.write(f"Similar User Min Percent Shared Books = {round(bookdb.SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS * 100)}%")
125
+ # st.write(f"Similar User Min Correlation = {bookdb.SIMILAR_USER_MIN_CORRELATION}")
126
+
127
+ # if "numSimilarUsers" in st.session_state:
128
+ # st.write(f"{st.session_state['numSimilarUsers']} similar users")
129
 
130
+ if st.session_state["topCorrelatedReadersData"] is not None:
131
+ df = st.session_state["topCorrelatedReadersData"]
 
132
 
133
+ st.subheader("Top Correlated Readers")
 
 
134
 
135
+ st.dataframe(df, use_container_width=True)
 
 
bookdb.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# A user counts as "similar" only when they rated strictly more than this
# fraction of the titles the target user voted on.
SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
# Similar users are kept as "top readers" only when their correlation with the
# target user is strictly above this value.
SIMILAR_USER_MIN_CORRELATION = 0.25

# Book catalogue: only the columns this module reads.
books = pd.read_csv(
    "./goodreads/books.csv",
    usecols=["book_id", "title", "average_rating"],
)
# Replace the ids from the file with 1..N in row order.
# NOTE(review): presumably this aligns the ids with ratings.csv - confirm.
books["book_id"] = range(1, len(books) + 1)

# Ratings table; the code below reads its user_id, book_id and rating columns.
baseRatings = pd.read_csv("./goodreads/ratings.csv")
15
+
16
def get_book_ids_by_title(book_titles):
    """Return the ``book_id`` of every book whose title is in *book_titles*.

    The result is an array in catalogue row order; it is empty when nothing
    matches.
    """
    title_matches = books["title"].isin(book_titles)
    return books.loc[title_matches, "book_id"].values
18
+
19
def get_all_book_titles():
    """Return every title in the book catalogue, in row order."""
    title_column = books["title"]
    return title_column.values
21
+
22
def get_book_title(book_id):
    """Return the title of the first catalogue row whose id equals *book_id*.

    Raises:
        IndexError: if no book has that id.
    """
    matching_titles = books.loc[books["book_id"] == book_id, "title"].values
    return matching_titles[0]
24
+
25
def get_book_titles(book_ids):
    """Return the titles of all books whose ``book_id`` is in *book_ids*.

    The result is an array in catalogue row order; it is empty when nothing
    matches.
    """
    id_matches = books["book_id"].isin(book_ids)
    return books.loc[id_matches, "title"].values
27
+
28
def update_user_ratings(upvotedBookIds, downvotedBookIds):
    """Recommend books for a temporary user described by up/down votes.

    The votes become ratings (5 for an upvote, 1 for a downvote) under a new
    user id one past the largest id in ``baseRatings``. Users who rated
    strictly more than ``SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS`` of the voted
    titles are "similar users". Among them, those whose ratings of the voted
    titles (missing ratings counted as 0) correlate with the temporary user
    above ``SIMILAR_USER_MIN_CORRELATION`` are the "top readers". Every rating
    of a top reader is weighted by that reader's correlation and averaged per
    book. Voted books are excluded, scores must exceed 1, and the 20 best are
    kept.

    Args:
        upvotedBookIds: ids of books the user liked.
        downvotedBookIds: ids of books the user disliked.

    Returns:
        dict with
        ``upvotedBookIds`` / ``downvotedBookIds``: the arguments, unchanged;
        ``numSimilarUsers``: number of similar users (the temporary user
        itself is included in this count);
        ``recommendedBooksData``: DataFrame of the recommended books without
        the ``book_id`` column, sorted by ``average_rating`` descending;
        ``topCorrelatedReadersData``: DataFrame indexed by ``user_id`` with a
        single ``corr`` column, sorted descending.
        When no vote refers to a known book the two frames are empty and
        ``numSimilarUsers`` is 0.
    """
    RATING_FOR_UPVOTE = 5
    RATING_FOR_DOWNVOTE = 1

    up_ids = list(upvotedBookIds)
    down_ids = list(downvotedBookIds)
    rated_ids = up_ids + down_ids

    def build_result(num_similar, recommended, readers):
        return {
            "upvotedBookIds": upvotedBookIds,
            "downvotedBookIds": downvotedBookIds,
            "numSimilarUsers": num_similar,
            "recommendedBooksData": recommended,
            # sort by correlation
            "topCorrelatedReadersData": readers.sort_values(by="corr", ascending=False),
        }

    def empty_result():
        no_readers = pd.DataFrame({"corr": pd.Series(dtype="float64")}).rename_axis("user_id")
        return build_result(0, books.iloc[0:0].drop(columns=["book_id"]), no_readers)

    if not rated_ids:
        return empty_result()

    # The temporary user gets an id that cannot clash with an existing one.
    new_user_id = baseRatings["user_id"].max() + 1

    new_rows = pd.DataFrame({
        "book_id": rated_ids,
        "user_id": [new_user_id] * len(rated_ids),
        "rating": [RATING_FOR_UPVOTE] * len(up_ids) + [RATING_FOR_DOWNVOTE] * len(down_ids),
    })
    ratings = pd.concat([baseRatings, new_rows], ignore_index=True)
    df = pd.merge(books, ratings, on="book_id", how="inner")

    # Titles the temporary user has a rating for (votes for ids that are not
    # in the catalogue disappear in the inner merge above).
    target_titles = df.loc[df["user_id"] == new_user_id, "title"].unique()
    if len(target_titles) == 0:
        return empty_result()

    # Pivot only the voted titles. Users with no rating for any of them can
    # never pass the shared-books threshold, so leaving them out changes
    # nothing, while a pivot over every rating would build a
    # users x all-titles matrix of which only these columns are read.
    shared = df[df["title"].isin(target_titles)]
    book_read_df = shared.groupby(["user_id", "title"])["rating"].mean().unstack()

    # How many of the voted titles each user has rated.
    user_book_count = book_read_df.notnull().sum(axis=1)
    min_book_count = book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS
    users_same_books = user_book_count[user_book_count > min_book_count].index

    # Unrated titles count as 0 when correlating.
    filtered = book_read_df[book_read_df.index.isin(users_same_books)].fillna(0)

    # Correlate every similar user with the temporary user only; a full
    # user x user correlation matrix is not needed to read this one column.
    target_vector = filtered.loc[new_user_id]
    corr_with_target = filtered.T.corrwith(target_vector)

    top_readers = (
        corr_with_target[corr_with_target > SIMILAR_USER_MIN_CORRELATION]
        .rename("corr")
        .rename_axis("user_id")
        .to_frame()
        # the temporary user correlates perfectly with itself
        .drop(index=new_user_id, errors="ignore")
    )

    # get the ratings for the top readers
    top_readers_ratings = top_readers.reset_index().merge(
        df[["user_id", "book_id", "rating"]], how="inner", on="user_id"
    )

    # weight their ratings by how correlated they are with the user
    top_readers_ratings["weighted_rating"] = (
        top_readers_ratings["corr"] * top_readers_ratings["rating"]
    )

    # Mean weighted rating per book; this is simply empty when there are no
    # top readers.
    scores = top_readers_ratings.groupby("book_id")["weighted_rating"].mean()

    # Never recommend a book the user has already voted on.
    scores = scores[~scores.index.isin(rated_ids)]

    best = scores[scores > 1].sort_values(ascending=False).head(20)

    # get the recommended books (and sort by average_rating)
    recommended_books = (
        books[books["book_id"].isin(best.index)]
        .sort_values(by="average_rating", ascending=False)
        .drop(columns=["book_id"])
    )

    return build_result(len(users_same_books), recommended_books, top_readers)