{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "30252107",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import operator\n",
    "import requests\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import sparse\n",
    "import sys\n",
    "from surprise import Dataset, Reader\n",
    "from surprise import KNNBasic, SVD\n",
    "from surprise.model_selection import train_test_split\n",
    "from surprise import accuracy\n",
    "from surprise.dataset import DatasetAutoFolds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c40008b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join path components with os.path.join rather than embedding a backslash in the\n",
    "# literal: backslash-B is not a valid string escape and is only a path separator on Windows.\n",
    "DATA_DIR = 'Book reviews'\n",
    "\n",
    "df1 = pd.read_csv(os.path.join(DATA_DIR, 'BX-Users.csv'), sep=';', encoding='ISO-8859-1')\n",
    "df2 = pd.read_csv(os.path.join(DATA_DIR, 'BX_Books.csv'), sep=';', encoding='ISO-8859-1')\n",
    "# Only the first 20,000 rating rows are loaded.\n",
    "df3 = pd.read_csv(os.path.join(DATA_DIR, 'BX-Book-Ratings.csv'), sep=';', encoding='ISO-8859-1', nrows=20_000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a422a310",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2180"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_ids = df3['User-ID'].tolist()\n",
    "# Distinct user ids in first-seen order. drop_duplicates replaces a loop that\n",
    "# re-scanned the growing result list for every row (quadratic in the row count).\n",
    "user_id = df3['User-ID'].drop_duplicates().tolist()\n",
    "len(user_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fea227ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias only (no copy): the train/test split below reads from data.\n",
    "data = df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "663d5ba4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 12660\n",
       "8 1694\n",
       "7 1526\n",
       "10 1272\n",
       "9 1105\n",
       "5 728\n",
       "6 663\n",
       "4 170\n",
       "3 108\n",
       "2 45\n",
       "1 29\n",
       "Name: Book-Rating, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distribution of rating values in the loaded rows.\n",
    "df3['Book-Rating'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c85ef134",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the row positions with a fixed seed so the split below is reproducible.\n",
    "n = len(df3)\n",
    "N = list(range(n))\n",
    "random.seed(2023)\n",
    "random.shuffle(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "beb6246d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User-IDISBNBook-Rating
15849244288452529060
1134971237844194458
173227742705535792740
183333363055321316410
1180688205538019450
\n", "
" ],
      "text/plain": [
       " User-ID ISBN Book-Rating\n",
       "15849 2442 8845252906 0\n",
       "11349 712 3784419445 8\n",
       "1732 277427 0553579274 0\n",
       "18333 3363 0553213164 10\n",
       "11806 882 0553801945 0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The first 80% of the shuffled row positions train the model; the rest are held out.\n",
    "split_at = (n * 4) // 5\n",
    "train = data.iloc[N[:split_at]]\n",
    "test = data.iloc[N[split_at:]]\n",
    "train.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f27ca18d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 9, 10, 2, 7, 5, 8, 6, 1, 4, 3]\n",
      "1912\n",
      "14033\n"
     ]
    }
   ],
   "source": [
    "# Rating values present, then the number of distinct users and books in the training rows.\n",
    "print(train['Book-Rating'].unique().tolist())\n",
    "print(len(train['User-ID'].unique()))\n",
    "print(len(train['ISBN'].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "94ebe1ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# The loaded ratings span 0-10 (see the value counts above), so the declared scale\n",
    "# has to include 0; Surprise clips estimates to this range.\n",
    "# NOTE(review): a 0 in Book-Crossing presumably marks an implicit interaction rather\n",
    "# than a score -- consider dropping those rows instead; confirm against the dataset docs.\n",
    "reader = Reader(rating_scale=(0, 10))\n",
    "trainset = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']], reader).build_full_trainset()\n",
    "print(type(trainset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "25d0a6ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing the msd similarity matrix...\n",
      "Done computing similarity matrix.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use the KNNBasic algorithm to train the model\n",
    "algo = KNNBasic()\n",
    "#algo = SVD()\n",
    "algo.fit(trainset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a4155aff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Held-out (user, item, rating) triples taken from the test rows. The previous\n",
    "# build_anti_testset() call produced only the pairs that were NOT rated, each with a\n",
    "# filler rating, which cannot be scored against real ratings.\n",
    "testset = Dataset.load_from_df(test[['User-ID', 'ISBN', 'Book-Rating']], reader).build_full_trainset().build_testset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "376eb001",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate pool for recommendations: the (user, item) pairs that have no rating in the training set.\n",
    "items = trainset.build_anti_testset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "dc366b18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ISBN -> title lookup used when printing recommendations.\n",
    "books = df2\n",
    "mapping_dict = books.set_index('ISBN')['Book-Title'].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2fe30c88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per held-out rating row, so a user id can appear more than once.\n",
    "users = test['User-ID'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "afc45dd3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1928"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-seed without an argument so the pick below is not tied to the fixed seed (2023)\n",
    "# that was set for the shuffle.\n",
    "random.seed()\n",
    "# randrange excludes its upper bound. randint(0, len(users)) includes it, so it could\n",
    "# return len(users) and make users[rd] raise IndexError.\n",
    "rd = random.randrange(len(users))\n",
    "users[rd]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1fff04c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "user = users[rd]\n",
    "# Keep only this user's candidate pairs; the list is empty when the user has no training rows.\n",
    "user_items = [pair for pair in items if pair[0] == user]\n",
    "recommendations = algo.test(user_items)\n",
    "# Highest estimated rating first; est is the fourth field of each prediction.\n",
    "recommendations.sort(key=operator.attrgetter('est'), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "de2718ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "User 1928 Recommendation Top 5:\n",
      " [Item] Four Blind Mice, [Estimated Rating] 10\n",
      " [Item] KJV Giant Print Reference Bible, Personal Size Bronze Edition, [Estimated Rating] 10\n",
      " [Item] So You Want to Be a Wizard: The First Book in the Young Wizards Series, [Estimated Rating] 10\n",
      " [Item] The Princess Diaries, [Estimated Rating] 10\n",
      " [Item] Memoirs of a Geisha, [Estimated Rating] 10\n"
     ]
    }
   ],
   "source": [
    "print(f'User {user} Recommendation Top 5:')\n",
    "if not recommendations:\n",
    "    print(' (no candidates: this user has no ratings in the training set)')\n",
    "for r in recommendations[0:5]:\n",
    "    # Fall back to the raw ISBN when the catalogue has no title for it, instead of\n",
    "    # silently dropping the entry (and hiding any other error) with a bare except.\n",
    "    title = mapping_dict.get(r.iid, r.iid)\n",
    "    print(f' [Item] {title}, [Estimated Rating] {round(r.est,3)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf7c2fcb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
"name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }