jchen8000's picture
Create app.py
725c528 verified
raw
history blame
2.02 kB
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import gradio as gr
# Load the MovieLens dataset
movies = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-latest-small.zip', compression='zip')
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
# Replace NaN with an empty string
movies['genres'] = movies['genres'].fillna('')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Construct a reverse map of indices and movie titles
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
# Get the index of the movie that matches the title
idx = indices[title]
# Get the pairwise similarity scores of all movies with that movie
sim_scores = list(enumerate(cosine_sim[idx]))
# Sort the movies based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Get the scores of the 10 most similar movies
sim_scores = sim_scores[1:11]
# Get the movie indices
movie_indices = [i[0] for i in sim_scores]
# Return the top 10 most similar movies
return movies['title'].iloc[movie_indices]
# Gradio interface
def recommend_movies(movie):
recommendations = get_recommendations(movie)
return recommendations.tolist()
# Create the Gradio interface
movie_list = movies['title'].tolist()
iface = gr.Interface(fn=recommend_movies, inputs=gr.inputs.Dropdown(movie_list), outputs="text", title="Movie Recommender", description="Select a movie to get recommendations based on content filtering.")
# Launch the app
iface.launch()