"""Gradio demo comparing three movie recommenders on the MovieLens small dataset.

The script builds, at import time:

* a content-based recommender (TF-IDF over the ``genres`` column),
* an item-based collaborative-filtering recommender (cosine similarity of
  the movie-user rating matrix), and
* the same collaborative recommender computed on a rating matrix that was
  reconstructed by a small dense autoencoder,

and then serves one Gradio tab per recommender.
"""

import random
import zipfile

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import gradio as gr

# Number of (randomly chosen) titles offered in each dropdown.
input_count = 300
# Result-size knob shared by all recommenders.
# NOTE(review): the content-based recommender returns ``result_count - 1``
# titles while the collaborative ones return ``n`` (default ``result_count``)
# titles. That asymmetry is inherited and kept for backward compatibility.
result_count = 21

# Extract the MovieLens dataset
with zipfile.ZipFile('ml-latest-small.zip') as z:
    with z.open('ml-latest-small/movies.csv') as f:
        movies = pd.read_csv(f)
    with z.open('ml-latest-small/ratings.csv') as f:
        ratings = pd.read_csv(f)


######################################
#
# Shared helpers
#
######################################

_ROW_FORMAT = "{:>5.2f} {:<20}"
_NO_SELECTION_MESSAGE = "No movie selected. Please select one from the dropdown."


def _format_recommendations(recommendations):
    """Render ``(title, score)`` pairs as the text table shown in the UI."""
    rows = [_ROW_FORMAT.format(score, title) for title, score in recommendations]
    return "Score Title\n" + "\n".join(rows)


def _similar_movies(movie_title, similarity_df, movies, n):
    """Return the ``n`` movies most similar to ``movie_title``.

    Args:
        movie_title: Title to look up in ``movies['title']``.
        similarity_df: Square DataFrame of movie-movie similarities whose
            index and columns are movieIds.
        movies: DataFrame with at least ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score.
        The list is empty when the title is unknown or the movie has no
        row in ``similarity_df``.
    """
    matching_ids = movies.loc[movies['title'] == movie_title, 'movieId']
    if matching_ids.empty:
        return []
    movie_id = matching_ids.iloc[0]

    if movie_id not in similarity_df.index:
        return []

    # Drop the query movie by id. Relying on it being ranked first is wrong
    # whenever another movie ties with it (identical rating vectors, or
    # floating-point round-off around 1.0).
    scores = similarity_df.loc[movie_id].drop(movie_id)
    top = scores.sort_values(ascending=False, kind='stable').head(n)

    # One id -> title table instead of a boolean-mask scan per result.
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    return [(title_by_id.loc[other_id], score) for other_id, score in top.items()]


######################################
#
# Content-based Filtering
#
######################################

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
movies['genres'] = movies['genres'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# TF-IDF rows are L2-normalised by default, so the linear kernel (dot
# product) equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse map: title -> row position in ``movies``. Duplicated *titles* are
# collapsed to their first occurrence so that ``indices[title]`` is always a
# scalar (``drop_duplicates()`` would de-duplicate the positions instead,
# which are already unique).
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_cb_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies whose genres are most similar to ``title``.

    Args:
        title: Movie title; must be present in ``indices``.
        cosine_sim: Square matrix of pairwise genre similarities, aligned
            with the rows of ``movies``.

    Returns:
        A list of at most ``result_count - 1`` ``(title, score)`` tuples in
        descending score order. The query movie itself is never included.

    Raises:
        KeyError: If ``title`` is not a known movie title.
    """
    idx = indices[title]
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending sort: ties keep dataset order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie by position; with many movies sharing the
    # exact same genres it is not guaranteed to be ranked first.
    ranked = ranked[ranked != idx][:result_count - 1]

    return [(movies['title'].iloc[i], scores[i]) for i in ranked]


# Gradio interface
def recommend_movies_cb(movie):
    """Gradio callback: format content-based recommendations for ``movie``."""
    if not movie:
        return _NO_SELECTION_MESSAGE
    if movie not in indices.index:
        return f"Movie '{movie}' not found in the dataset."
    return _format_recommendations(get_cb_recommendations(movie))


######################################
#
# Collaborative Filtering (Item-based)
#
######################################

# Create a movie-user matrix (rows: movies, columns: users, 0 = no rating)
movie_user_matrix = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)

# Compute the cosine similarity between movies
movie_similarity = cosine_similarity(movie_user_matrix)

# Create a DataFrame with movie similarities
movie_similarity_df = pd.DataFrame(movie_similarity,
                                   index=movie_user_matrix.index,
                                   columns=movie_user_matrix.index)


def get_cf_recommendations(movie_title, movie_similarity_df=movie_similarity_df,
                           movies=movies, n=result_count):
    """Return item-based collaborative-filtering recommendations.

    Args:
        movie_title: Title of the query movie.
        movie_similarity_df: Movie-movie similarity DataFrame indexed by
            movieId on both axes.
        movies: DataFrame with ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations.

    Returns:
        A list of up to ``n`` ``(title, score)`` tuples, best first; empty
        when the movie is unknown or has no ratings.
    """
    return _similar_movies(movie_title, movie_similarity_df, movies, n)


# Function for Gradio interface
def recommend_movies_cf(movie_title):
    """Gradio callback: format collaborative-filtering recommendations."""
    if not movie_title:
        return _NO_SELECTION_MESSAGE
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."
    return _format_recommendations(get_cf_recommendations(movie_title))


######################################
#
# Collaborative Filtering with Neural Network (Item-based)
#
######################################

# Normalize the ratings
scaler = MinMaxScaler()
movie_user_matrix_scaled = scaler.fit_transform(movie_user_matrix)

# Define the autoencoder model
input_dim = movie_user_matrix.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder
autoencoder.fit(movie_user_matrix_scaled, movie_user_matrix_scaled,
                epochs=50, batch_size=64, shuffle=True,
                validation_split=0.2, verbose=0)

# Use the trained autoencoder to predict the complete matrix
predicted_matrix_scaled = autoencoder.predict(movie_user_matrix_scaled)
predicted_matrix = scaler.inverse_transform(predicted_matrix_scaled)

# Create a DataFrame with the predicted matrix
predicted_matrix_df = pd.DataFrame(predicted_matrix,
                                   index=movie_user_matrix.index,
                                   columns=movie_user_matrix.columns)

# Compute the cosine similarity between movies using the predicted matrix
movie_similarity_cfnn = cosine_similarity(predicted_matrix)

# Create a DataFrame with movie similarities. This must wrap the
# autoencoder-based matrix; wrapping ``movie_similarity`` would make this
# recommender identical to the plain collaborative one.
movie_similarity_cfnn_df = pd.DataFrame(movie_similarity_cfnn,
                                        index=movie_user_matrix.index,
                                        columns=movie_user_matrix.index)


def get_cfnn_recommendations(movie_title, movie_similarity_df=movie_similarity_cfnn_df,
                             movies=movies, n=result_count):
    """Return recommendations from the autoencoder-reconstructed ratings.

    Same contract as ``get_cf_recommendations``; only the default
    similarity matrix differs.

    Args:
        movie_title: Title of the query movie.
        movie_similarity_df: Movie-movie similarity DataFrame indexed by
            movieId on both axes.
        movies: DataFrame with ``movieId`` and ``title`` columns.
        n: Maximum number of recommendations.

    Returns:
        A list of up to ``n`` ``(title, score)`` tuples, best first; empty
        when the movie is unknown or has no row in the similarity matrix.
    """
    return _similar_movies(movie_title, movie_similarity_df, movies, n)


# Function for Gradio interface
def recommend_movies_cfnn(movie_title):
    """Gradio callback: format autoencoder-based recommendations."""
    if not movie_title:
        return _NO_SELECTION_MESSAGE
    if movie_title not in movies['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."
    return _format_recommendations(get_cfnn_recommendations(movie_title))


######################################
#
# Gradio interface
#
######################################

# Create a list of movie titles for the dropdown. Sample from the unique
# titles and never ask for more than are available, so small datasets do not
# raise ValueError and the dropdown has no repeated entries.
_unique_titles = movies['title'].drop_duplicates().tolist()
movie_list = random.sample(_unique_titles, min(input_count, len(_unique_titles)))
total_movies = len(movies)

_DROPDOWN_LABEL = (
    f"Select a Movie (Total movies: {total_movies}, "
    f"randomly list {input_count} for demo purpose.)"
)

_CB_MARKDOWN = """## Movie Recommender - Content-Based Filtering
How it works:
* Use the 'genres' feature of movies, and convert genres into numerical vectors.
* For a given movie, find the most similar movies based on the genre similarity.
* This approach uses genres of movies only, without considering user preferences or viewing history.
* Simple to implement and computationally efficient.
"""

_CF_MARKDOWN = """## Movie Recommender - Item-Based Collaborative Filtering
How it works:
* Create a movie-user matrix where rows represent movies and columns represent users, each cell contains the rating a user gave to a movie, or 0 if no rating exists.
* Calculate the cosine similarity between movies based on their rating patterns, results in a movie-movie similarity matrix.
* For a given movie, find the most similar movies based on this similarity matrix, and recommend these movies.
* Simple to implement and computationally efficient, but doesn't handle sparsity well (when many missing ratings).
"""

_CFNN_MARKDOWN = """## Movie Recommender - Item-Based Collaborative Filtering with Neural Network
How it works:
* Use a Neural Network to predict the missing values in the movie-user matrix to improve the collaborative filtering recommendations.
* The NN model learns to reconstruct the movie-user matrix, effectively predicting missing ratings. This results in a dense, predicted movie-user matrix.
* Calculate movie-movie similarities using the predicted matrix. And use this similarity matrix to find and recommend similar movies.
* This approach often provides more accurate recommendations especially with large sparse datasets. But more complex to implement and require more computational resources.
"""


def _add_tab(tab_title, markdown, fn, description):
    """Add one recommender tab; must be called inside a ``gr.Blocks`` context."""
    with gr.Tab(tab_title):
        gr.Markdown(markdown)
        gr.Interface(
            fn=fn,
            # Each tab needs its own Dropdown/Textbox component instances.
            inputs=gr.Dropdown(movie_list, label=_DROPDOWN_LABEL),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            description=description,
        )


with gr.Blocks() as iface:
    _add_tab(
        "Content-Based Filtering",
        _CB_MARKDOWN,
        recommend_movies_cb,
        "Select a movie to get recommendations based on content filtering.",
    )
    _add_tab(
        "Collaborative Filtering",
        _CF_MARKDOWN,
        recommend_movies_cf,
        "Select a movie to get recommendations based on collaborative filtering.",
    )
    _add_tab(
        "Collaborative Filtering with Neural Network",
        _CFNN_MARKDOWN,
        recommend_movies_cfnn,
        "Select a movie to get recommendations based on collaborative filtering.",
    )

# Launch the app
iface.launch()