{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [
 { "cell_type": "code", "execution_count": null, "metadata": { "id": "SKRYfHwWyVaG" }, "outputs": [], "source": [
  "# Importing Libraries\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import sklearn\n",
  "import matplotlib.pyplot as plt\n",
  "import seaborn as sns\n",
  "import warnings\n",
  "warnings.simplefilter(action='ignore', category=FutureWarning)" ] },
 { "cell_type": "code", "source": [
  "# Load the ratings table (columns: userId, movieId, rating, timestamp).\n",
  "ratings_url = \"https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv\"\n",
  "ratings = pd.read_csv(ratings_url)\n",
  "print(ratings.head())" ],
  "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v_ZFn93Wy1ho", "outputId": "97f98476-d909-4050-bc37-68369391d756" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
  " userId movieId rating timestamp\n",
  "0 1 1 4.0 964982703\n",
  "1 1 3 4.0 964981247\n",
  "2 1 6 4.0 964982224\n",
  "3 1 47 5.0 964983815\n",
  "4 1 50 5.0 964982931\n" ] } ] },
 { "cell_type": "code", "source": [
  "# Load the movie table (columns: movieId, title, genres).\n",
  "movies_url = \"https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv\"\n",
  "movies = pd.read_csv(movies_url)\n",
  "print(movies.head())" ],
  "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_RCPOQWfy269", "outputId": "4c3c68a1-dbbb-4795-d96a-4f9c11d3731b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
  " movieId title \\\n",
  "0 1 Toy Story (1995) \n",
  "1 2 Jumanji (1995) \n",
  "2 3 Grumpier Old Men (1995) \n",
  "3 4 Waiting to Exhale (1995) \n",
  "4 5 Father of the Bride Part II (1995) \n",
  "\n",
  " genres \n",
  "0 Adventure|Animation|Children|Comedy|Fantasy \n",
  "1 Adventure|Children|Fantasy \n",
  "2 Comedy|Romance \n",
  "3 Comedy|Drama|Romance \n",
  "4 Comedy \n" ] } ] },
 { "cell_type": "code", "source": [
"n_ratings = len(ratings)\n", "n_movies = len(ratings['movieId'].unique())\n", "n_users = len(ratings['userId'].unique())\n", "\n", "print(f\"Number of ratings: {n_ratings}\")\n", "print(f\"Number of unique movieId's: {n_movies}\")\n", "print(f\"Number of unique users: {n_users}\")\n", "print(f\"Average ratings per user: {round(n_ratings/n_users, 2)}\")\n", "print(f\"Average ratings per movie: {round(n_ratings/n_movies, 2)}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ypivRYgqy4kb", "outputId": "360eef9e-9186-4ed6-ed50-fe8e6a3fabf0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Number of ratings: 100836\n", "Number of unique movieId's: 9724\n", "Number of unique users: 610\n", "Average ratings per user: 165.3\n", "Average ratings per movie: 10.37\n" ] } ] }, { "cell_type": "code", "source": [ "user_freq = ratings[['userId', 'movieId']].groupby(\n", " 'userId').count().reset_index()\n", "user_freq.columns = ['userId', 'n_ratings']\n", "print(user_freq.head())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PYZsye4-zAfi", "outputId": "fb38061d-d9bc-4552-de9b-de418780ec32" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " userId n_ratings\n", "0 1 232\n", "1 2 29\n", "2 3 39\n", "3 4 216\n", "4 5 44\n" ] } ] }, { "cell_type": "code", "source": [ "# Find Lowest and Highest rated movies:\n", "mean_rating = ratings.groupby('movieId')[['rating']].mean()\n", "# Lowest rated movies\n", "lowest_rated = mean_rating['rating'].idxmin()\n", "movies.loc[movies['movieId'] == lowest_rated]\n", "# Highest rated movies\n", "highest_rated = mean_rating['rating'].idxmax()\n", "movies.loc[movies['movieId'] == highest_rated]\n", "# show number of people who rated movies rated movie highest\n", "ratings[ratings['movieId']==highest_rated]\n", "# show number of people who rated movies rated movie lowest\n", 
"ratings[ratings['movieId']==lowest_rated]\n", "\n", "## the above movies has very low dataset. We will use bayesian average\n", "movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])\n", "movie_stats.columns = movie_stats.columns.droplevel()" ], "metadata": { "id": "H1s9d6QIzBzv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Now, we create user-item matrix using scipy csr matrix\n", "from scipy.sparse import csr_matrix\n", "\n", "def create_matrix(df):\n", "\n", " N = len(df['userId'].unique())\n", " M = len(df['movieId'].unique())\n", "\n", " # Map Ids to indices\n", " user_mapper = dict(zip(np.unique(df[\"userId\"]), list(range(N))))\n", " movie_mapper = dict(zip(np.unique(df[\"movieId\"]), list(range(M))))\n", "\n", " # Map indices to IDs\n", " user_inv_mapper = dict(zip(list(range(N)), np.unique(df[\"userId\"])))\n", " movie_inv_mapper = dict(zip(list(range(M)), np.unique(df[\"movieId\"])))\n", "\n", " user_index = [user_mapper[i] for i in df['userId']]\n", " movie_index = [movie_mapper[i] for i in df['movieId']]\n", "\n", " X = csr_matrix((df[\"rating\"], (movie_index, user_index)), shape=(M, N))\n", "\n", " return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper\n", "\n", "X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)" ], "metadata": { "id": "2tG23gzjzDLg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\"\"\"\n", "Find similar movies using KNN\n", "\"\"\"\n", "from sklearn.neighbors import NearestNeighbors\n", "def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):\n", "\n", " neighbour_ids = []\n", "\n", " movie_ind = movie_mapper[movie_id]\n", " movie_vec = X[movie_ind]\n", " k+=1\n", " kNN = NearestNeighbors(n_neighbors=k, algorithm=\"brute\", metric=metric)\n", " kNN.fit(X)\n", " movie_vec = movie_vec.reshape(1,-1)\n", " neighbour = kNN.kneighbors(movie_vec, 
return_distance=show_distance)\n", " for i in range(0,k):\n", " n = neighbour.item(i)\n", " neighbour_ids.append(movie_inv_mapper[n])\n", " neighbour_ids.pop(0)\n", " return neighbour_ids\n", "\n", "\n", "movie_titles = dict(zip(movies['movieId'], movies['title']))\n", "\n", "movie_id = 3\n", "\n", "similar_ids = find_similar_movies(movie_id, X, k=10)\n", "movie_title = movie_titles[movie_id]\n", "\n", "print(f\"Since you watched {movie_title}\")\n", "for i in similar_ids:\n", " print(movie_titles[i])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "onBGmGk5zGAZ", "outputId": "c5a409ff-c16d-413f-9339-a2ff977eef69" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Since you watched Grumpier Old Men (1995)\n", "Grumpy Old Men (1993)\n", "Striptease (1996)\n", "Nutty Professor, The (1996)\n", "Twister (1996)\n", "Father of the Bride Part II (1995)\n", "Broken Arrow (1996)\n", "Bio-Dome (1996)\n", "Truth About Cats & Dogs, The (1996)\n", "Sabrina (1995)\n", "Birdcage, The (1996)\n" ] } ] }, { "cell_type": "code", "source": [ "def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):\n", " df1 = ratings[ratings['userId'] == user_id]\n", "\n", " if df1.empty:\n", " print(f\"User with ID {user_id} does not exist.\")\n", " return\n", "\n", " movie_id = df1[df1['rating'] == max(df1['rating'])]['movieId'].iloc[0]\n", "\n", " movie_titles = dict(zip(movies['movieId'], movies['title']))\n", "\n", " similar_ids = find_similar_movies(movie_id, X, k)\n", " movie_title = movie_titles.get(movie_id, \"Movie not found\")\n", "\n", " if movie_title == \"Movie not found\":\n", " print(f\"Movie with ID {movie_id} not found.\")\n", " return\n", "\n", " print(f\"Since you watched {movie_title}, you might also like:\")\n", " for i in similar_ids:\n", " print(movie_titles.get(i, \"Movie not found\"))" ], "metadata": { "id": "PrN_SjhMzHxy" }, "execution_count": null, "outputs": [] 
},
 { "cell_type": "code", "source": [
  "# Recommendations for user 150; change user_id to try another user.\n",
  "user_id = 150\n",
  "recommend_movies_for_user(\n",
  "    user_id,\n",
  "    X,\n",
  "    user_mapper=user_mapper,\n",
  "    movie_mapper=movie_mapper,\n",
  "    movie_inv_mapper=movie_inv_mapper,\n",
  "    k=10,\n",
  ")" ],
  "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L13wNuTOzJry", "outputId": "5316c18d-8323-4fc8-8ed0-a503edf93f29" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
  "Since you watched Twelve Monkeys (a.k.a. 12 Monkeys) (1995), you might also like:\n",
  "Pulp Fiction (1994)\n",
  "Terminator 2: Judgment Day (1991)\n",
  "Independence Day (a.k.a. ID4) (1996)\n",
  "Seven (a.k.a. Se7en) (1995)\n",
  "Fargo (1996)\n",
  "Fugitive, The (1993)\n",
  "Usual Suspects, The (1995)\n",
  "Jurassic Park (1993)\n",
  "Star Wars: Episode IV - A New Hope (1977)\n",
  "Heat (1995)\n" ] } ] },
 { "cell_type": "code", "source": [
  "# Recommendations for user 415; change user_id to try another user.\n",
  "user_id = 415\n",
  "recommend_movies_for_user(\n",
  "    user_id,\n",
  "    X,\n",
  "    user_mapper=user_mapper,\n",
  "    movie_mapper=movie_mapper,\n",
  "    movie_inv_mapper=movie_inv_mapper,\n",
  "    k=10,\n",
  ")" ],
  "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pEZ5ISP8zLB1", "outputId": "e61b8c3f-db5d-4c49-b876-5bc19a490ce4" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
  "Since you watched Pulp Fiction (1994), you might also like:\n",
  "Silence of the Lambs, The (1991)\n",
  "Shawshank Redemption, The (1994)\n",
  "Seven (a.k.a. Se7en) (1995)\n",
  "Forrest Gump (1994)\n",
  "Usual Suspects, The (1995)\n",
  "Braveheart (1995)\n",
  "Fight Club (1999)\n",
  "Fargo (1996)\n",
  "Terminator 2: Judgment Day (1991)\n",
  "Reservoir Dogs (1992)\n" ] } ] } ] }