Spaces:

conlan
/

book-rater

Sleeping

App Files Files Community

conlan committed on Aug 3, 2024

Commit

8189b4d

1 Parent(s): 6c5993d

Update to v2 of book rater

Browse files

Files changed (2) hide show

app.py +110 -54
bookdb.py +132 -0

app.py CHANGED Viewed

@@ -1,79 +1,135 @@
 import pickle
 import streamlit as st
 import numpy as np
-st.header("Book Recommender System")
-model = pickle.load(open("artifacts/model.pkl", "rb"))
-book_names = pickle.load(open("artifacts/book_names.pkl", "rb"))
-final_ratings = pickle.load(open("artifacts/final_ratings.pkl", "rb"))
-book_pivot = pickle.load(open("artifacts/book_pivot.pkl", "rb"))
-def fetch_poster(suggestion):
-    bookNames = []
-    idsIndex = []
-    posterUrl = []
-    for bookId in suggestion[0]:
-        name = book_pivot.index[bookId]
-        bookNames.append(book_pivot.index[bookId])
-    for name in bookNames:
-        ids = np.where(final_ratings['title'] == name)[0][0]
-        idsIndex.append(ids)
-    for idx in idsIndex:
-        row = final_ratings.iloc[idx]
-        url = row['img_url']
-        posterUrl.append(url)
-    return posterUrl
-def recommend_book(bookName):
-    bookList = []
-    book_id = np.where(book_pivot.index == bookName)[0][0]
-    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1, -1), n_neighbors=5)
-    poster_url = fetch_poster(suggestion)
-    for i in range(len(suggestion)):
-        books = book_pivot.index[suggestion[i]]
-        for j in books:
-            bookList.append(j)
-    return bookList, poster_url
-selected_books = st.selectbox(
-    "Select a book",
-    book_names
-)
-if st.button("Show Recommendations"):
-    recommendations, posterUrls = recommend_book(selected_books)
     st.subheader("Recommendations")
-    col1, col2, col3, col4 = st.columns(4)
-    for url in posterUrls:
-        print(url)
-    with col1:
-        st.text(recommendations[1])
-        st.image(posterUrls[1])
-    with col2:
-        st.text(recommendations[2])
-        st.image(posterUrls[2])
-    with col3:
-        st.text(recommendations[3])
-        st.image(posterUrls[3])
-    with col4:
-        st.text(recommendations[4])
-        st.image(posterUrls[4])

 import pickle
 import streamlit as st
 import numpy as np
+import bookdb
+st.header("My Book Buddy 🐛")
+# Seed every session-state slot the app reads. Keys that already exist (from
+# an earlier rerun of this script) are left untouched.
+for stateKey, initialValue in (
+    ("upvoted_book_ids", []),
+    ("downvoted_book_ids", []),
+    ("recommendedBooksData", None),
+    ("topCorrelatedReadersData", None),
+    ("numSimilarUsers", 0),
+):
+    if stateKey not in st.session_state:
+        st.session_state[stateKey] = initialValue
+def update_display(displayData):
+    """Copy the result of a ratings update into session state.
+
+    displayData is a mapping that must provide the keys "upvotedBookIds",
+    "downvotedBookIds", "numSimilarUsers", "recommendedBooksData" and
+    "topCorrelatedReadersData"; each value is stored under the matching
+    session-state key that the page reads when rendering.
+    """
+    # (session-state key, displayData key), kept in the original write order.
+    keyPairs = (
+        ("upvoted_book_ids", "upvotedBookIds"),
+        ("downvoted_book_ids", "downvotedBookIds"),
+        ("numSimilarUsers", "numSimilarUsers"),
+        ("recommendedBooksData", "recommendedBooksData"),
+        ("topCorrelatedReadersData", "topCorrelatedReadersData"),
+    )
+    for stateKey, dataKey in keyPairs:
+        st.session_state[stateKey] = displayData[dataKey]
+def on_reset_votes():
+    """Button callback: forget all votes and the results derived from them."""
+    # Vote lists go back to empty ...
+    for voteKey in ("upvoted_book_ids", "downvoted_book_ids"):
+        st.session_state[voteKey] = []
+    st.session_state["numSimilarUsers"] = 0
+    # ... and the result tables back to "nothing computed yet".
+    for resultKey in ("recommendedBooksData", "topCorrelatedReadersData"):
+        st.session_state[resultKey] = None
+def on_submit_votes():
+    """Form callback: merge the newly selected votes and refresh the results.
+
+    Reads the titles chosen in the two multiselects, resolves them to book
+    ids, combines them with the votes already held in session state and
+    passes the combined votes to bookdb.update_user_ratings. The returned
+    data is stored through update_display.
+
+    A warning is shown and nothing is changed when no book was selected, or
+    when the combined votes would not contain at least one upvote and one
+    downvote.
+    """
+    upvoteBookTitles = st.session_state["multiselect_upvote"]
+    downvoteBookTitles = st.session_state["multiselect_downvote"]
+    if not upvoteBookTitles and not downvoteBookTitles:
+        st.warning("Please select at least one book to upvote or downvote")
+        return
+    booksToUpvote = list(bookdb.get_book_ids_by_title(upvoteBookTitles))
+    booksToDownvote = list(bookdb.get_book_ids_by_title(downvoteBookTitles))
+    # Build new lists rather than extending the ones stored in session state:
+    # if the ratings update below fails, the previously stored votes must not
+    # be left half-modified.
+    upvotedBookIds = st.session_state["upvoted_book_ids"] + booksToUpvote
+    combinedDownvotes = st.session_state["downvoted_book_ids"] + booksToDownvote
+    # An upvote wins over a downvote for the same book.
+    upvotedLookup = set(upvotedBookIds)
+    downvotedBookIds = [x for x in combinedDownvotes if x not in upvotedLookup]
+    # Validate after removing the overlap, so a book picked in both lists
+    # cannot pass as the only downvote.
+    if not upvotedBookIds or not downvotedBookIds:
+        st.warning("You must select at least one book to upvote and one book to downvote")
+        return
+    # clear the multiselects
+    st.session_state.multiselect_upvote = []
+    st.session_state.multiselect_downvote = []
+    update_display(bookdb.update_user_ratings(upvotedBookIds, downvotedBookIds))
+with st.form(key='upvote_form'):
+    col1, col2 = st.columns(2)
+    allBookTitles = bookdb.get_all_book_titles()
+    # Titles that already carry a vote are hidden from both pickers. They are
+    # collected into a set once, so filtering the full title list does not
+    # rescan the rated titles for every book.
+    myRatedBookTitles = set(
+        bookdb.get_book_titles(
+            st.session_state["upvoted_book_ids"] + st.session_state["downvoted_book_ids"]
+        )
+    )
+    remainingBookTitles = [x for x in allBookTitles if x not in myRatedBookTitles]
+    col1.multiselect(
+        'Upvote Books 👍',
+        remainingBookTitles,
+        key='multiselect_upvote'
+    )
+    col2.multiselect(
+        'Downvote Books 👎',
+        remainingBookTitles,
+        key='multiselect_downvote'
+    )
+    # The callback reads both multiselects from session state via their keys.
+    st.form_submit_button(label='Submit', type="primary", on_click=on_submit_votes)
+# Results are rendered only after a submission has produced recommendations.
+if st.session_state["recommendedBooksData"] is not None:
+    recommendedBooks = st.session_state["recommendedBooksData"]
     st.subheader("Recommendations")
+    st.dataframe(recommendedBooks, hide_index=True, use_container_width=True)
+    st.button('Reset All Ratings', type="secondary", on_click=on_reset_votes)
+    st.subheader("Your Ratings")
+    displayCol1, displayCol2 = st.columns(2)
+    # One column per vote type; a column stays empty when it has no votes.
+    for voteColumn, voteLabel, voteKey in (
+        (displayCol1, "Upvoted", "upvoted_book_ids"),
+        (displayCol2, "Downvoted", "downvoted_book_ids"),
+    ):
+        votedIds = st.session_state[voteKey]
+        if len(votedIds) > 0:
+            voteColumn.markdown(f"{voteLabel}: {len(votedIds)} book(s)")
+            for bookId in votedIds:
+                voteColumn.markdown(f'  - {bookId}-{bookdb.get_book_title(bookId)}')
+    st.divider()
+    # st.write(f"Similar User Min Percent Shared Books = {round(bookdb.SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS * 100)}%")
+    # st.write(f"Similar User Min Correlation = {bookdb.SIMILAR_USER_MIN_CORRELATION}")
+    # if "numSimilarUsers" in st.session_state:
+        # st.write(f"{st.session_state['numSimilarUsers']} similar users")
+    topReaders = st.session_state["topCorrelatedReadersData"]
+    if topReaders is not None:
+        st.subheader("Top Correlated Readers")
+        st.dataframe(topReaders, use_container_width=True)

bookdb.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import pandas as pd
+# Thresholds read by update_user_ratings when it selects similar readers:
+# the fraction of the voter's titles another user must exceed, and the
+# correlation another user must exceed.
+SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS = 0.30
+SIMILAR_USER_MIN_CORRELATION = 0.25
+# Book catalogue, loaded once at import time; only the columns used here.
+books = pd.read_csv(
+    "./goodreads/books.csv",
+    usecols=["book_id", "title", "average_rating"],
+)
+# The ids read from the CSV are replaced with sequential ids 1..N in row order.
+# NOTE(review): presumably so they line up with book_id in ratings.csv - confirm.
+books['book_id'] = range(1, len(books) + 1)
+# Ratings of the existing users; update_user_ratings concatenates the
+# temporary voter's rows onto a copy and never modifies this frame.
+baseRatings = pd.read_csv("./goodreads/ratings.csv")
+def get_book_ids_by_title(book_titles):
+    """Return an array with the book_id of every book whose title is in book_titles."""
+    titleMatches = books["title"].isin(book_titles)
+    return books.loc[titleMatches, "book_id"].values
+def get_all_book_titles():
+    """Return an array with the title of every book, in catalogue row order."""
+    titleColumn = books["title"]
+    return titleColumn.values
+def get_book_title(book_id):
+    """Return the title of the first book whose book_id equals book_id.
+
+    Raises IndexError when no book has that id.
+    """
+    matchingTitles = books.loc[books["book_id"] == book_id, "title"]
+    return matchingTitles.values[0]
+def get_book_titles(book_ids):
+    """Return an array with the title of every book whose book_id is in book_ids."""
+    idMatches = books["book_id"].isin(book_ids)
+    return books.loc[idMatches, "title"].values
+def update_user_ratings(upvotedBookIds, downvotedBookIds):
+    """Recommend books for a temporary user built from the given votes.
+
+    The votes are appended to a copy of the base ratings as a new user
+    (upvote = rating 5, downvote = rating 1). Users who rated more than
+    SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS of the voter's titles, and whose
+    ratings on those titles correlate with the voter's above
+    SIMILAR_USER_MIN_CORRELATION, are the "top readers". Their ratings,
+    weighted by that correlation, rank the candidate books.
+
+    Args:
+        upvotedBookIds: book_id values the user upvoted.
+        downvotedBookIds: book_id values the user downvoted.
+
+    Returns:
+        A dict with the keys
+        "upvotedBookIds" / "downvotedBookIds": the arguments, unchanged;
+        "numSimilarUsers": number of users passing the shared-titles filter
+            (the temporary user is counted too);
+        "recommendedBooksData": DataFrame of at most 20 unvoted books, sorted
+            by average_rating descending, without the book_id column;
+        "topCorrelatedReadersData": DataFrame indexed by user_id with a
+            "corr" column, sorted by correlation descending.
+    """
+    RATING_FOR_UPVOTE = 5
+    RATING_FOR_DOWNVOTE = 1
+    MIN_WEIGHTED_RATING = 1
+    MAX_RECOMMENDATIONS = 20
+
+    upvoted = list(upvotedBookIds)
+    downvoted = list(downvotedBookIds)
+    votedBookIds = upvoted + downvoted
+
+    # The temporary user gets an id one past the largest existing one.
+    newUserId = baseRatings['user_id'].max() + 1
+    newRows = pd.DataFrame({
+        'book_id': votedBookIds,
+        'user_id': [newUserId] * len(votedBookIds),
+        'rating': ([RATING_FOR_UPVOTE] * len(upvoted)
+                   + [RATING_FOR_DOWNVOTE] * len(downvoted)),
+    })
+    ratings = pd.concat([baseRatings, newRows], ignore_index=True)
+    df = pd.merge(books, ratings, on="book_id", how="inner")
+
+    # Only the titles the voter rated matter for the similarity step, so the
+    # user x title table is built for those titles alone instead of for the
+    # whole catalogue. Users without any of these titles simply have no row.
+    targetTitles = df.loc[df["user_id"] == newUserId, "title"].unique()
+    sharedRatings = df[df["title"].isin(targetTitles)]
+    book_read_df = sharedRatings.groupby(["user_id", "title"])["rating"].mean().unstack()
+
+    # Keep users who rated more than the minimum share of the voter's titles.
+    userBookCount = book_read_df.notnull().sum(axis=1)
+    minBookCount = book_read_df.shape[1] * SIMILAR_USER_MIN_PERCENT_SHARED_BOOKS
+    usersSameBooks = userBookCount[userBookCount > minBookCount].index
+
+    # Titles a user has not rated count as 0 in the correlation.
+    filtered_df = book_read_df[book_read_df.index.isin(usersSameBooks)].fillna(0)
+
+    # Correlate every remaining user with the voter directly; the full
+    # user x user correlation matrix is never needed.
+    correlations = filtered_df.T.corrwith(filtered_df.loc[newUserId])
+    top_readers = (
+        correlations[correlations > SIMILAR_USER_MIN_CORRELATION]
+        .rename("corr")
+        .rename_axis("user_id")
+        .to_frame()
+    )
+    if newUserId in top_readers.index:
+        top_readers = top_readers.drop(newUserId)
+
+    # Fetch every rating of the top readers and weight it by how strongly
+    # that reader correlates with the voter.
+    top_readers_ratings = pd.merge(
+        top_readers, df[["user_id", "book_id", "rating"]], how='inner', on="user_id"
+    )
+    top_readers_ratings['weighted_rating'] = (
+        top_readers_ratings['corr'] * top_readers_ratings['rating']
+    )
+    # Mean weighted rating per book. groupby (unlike pivot_table) still yields
+    # a usable, empty result when there are no top readers.
+    weightedByBook = top_readers_ratings.groupby("book_id")["weighted_rating"].mean()
+    # Books the user already voted on are never recommended.
+    weightedByBook = weightedByBook[~weightedByBook.index.isin(votedBookIds)]
+    books_recommend = (
+        weightedByBook[weightedByBook > MIN_WEIGHTED_RATING]
+        .sort_values(ascending=False)
+        .head(MAX_RECOMMENDATIONS)
+    )
+
+    # Present the selected books ordered by their catalogue average rating.
+    recommendedBooks = (
+        books[books["book_id"].isin(books_recommend.index)]
+        .sort_values(by="average_rating", ascending=False)
+        .drop(columns=["book_id"])
+    )
+    return {
+        "upvotedBookIds": upvotedBookIds,
+        "downvotedBookIds": downvotedBookIds,
+        "numSimilarUsers": len(usersSameBooks),
+        "recommendedBooksData": recommendedBooks,
+        "topCorrelatedReadersData": top_readers.sort_values(by="corr", ascending=False),
+    }