"""Content-based movie recommender over the MovieLens "latest-small" dataset.

Downloads the dataset, builds a TF-IDF representation of each movie's
``genres`` string, precomputes the pairwise similarity matrix and exposes a
Gradio interface that lists the movies most similar to a selected title.
"""

import io
import zipfile

import gradio as gr
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Download and extract the MovieLens dataset.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# A timeout keeps start-up from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being fed to ZipFile.
response = requests.get(url, timeout=30)
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('ml-latest-small/movies.csv') as f:
        movies = pd.read_csv(f)

# Define a TF-IDF Vectorizer object. Remove all English stop words such as
# 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so the vectorizer only sees strings.
movies['genres'] = movies['genres'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data.
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the similarity matrix as the dot product of the TF-IDF rows.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to row position.
# Series.drop_duplicates() would de-duplicate the *values* (row positions,
# which are already unique) and leave repeated titles in the index, making
# ``indices[title]`` return a Series instead of a scalar. De-duplicate on the
# index instead, keeping the first row for each title.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim, top_n=20):
    """Return the movies most similar to *title*.

    Args:
        title: Exact movie title as it appears in ``movies['title']``.
        cosine_sim: Square similarity matrix indexed by row position in
            ``movies``. Defaults to the matrix computed at import time.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score.
        Ties keep their original row order.

    Raises:
        KeyError: If *title* is not present in the title index.
    """
    # Get the row position of the movie that matches the title.
    idx = int(indices[title])

    # Rank every *other* movie by its similarity to the query movie. The
    # query row is excluded explicitly: movies with an identical genre string
    # tie with it at the top score, so it is not guaranteed to sort first and
    # cannot be dropped by position.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    # Pair each recommended title with its similarity score.
    titles = movies['title']
    return [(titles.iloc[pos], score) for pos, score in ranked]


# Gradio interface
def recommend_movies(movie):
    """Format the recommendations for *movie* as a plain-text table.

    Args:
        movie: Title selected in the dropdown; ``None`` when nothing has
            been selected.

    Returns:
        A header line followed by one ``score title`` line per
        recommendation, or a short message when *movie* is not a known
        title.
    """
    # An empty dropdown submits None; answer with a message, not a KeyError.
    if movie not in indices.index:
        return "Please select a movie from the list."

    recommendations = get_recommendations(movie)
    format_string = "{:>5.2f} {:<20}"
    return "Score Title\n" + "\n".join(
        format_string.format(score, title) for title, score in recommendations
    )


# Create the Gradio interface
movie_list = movies['title'].tolist()
total_movies = len(movies)
iface = gr.Interface(
    fn=recommend_movies,
    inputs=gr.Dropdown(
        movie_list,
        label=f"Select a Movie (Total movies: {total_movies})",
    ),
    outputs="text",
    title="Movie Recommender - Content-Based Filtering",
    description="Select a movie to get recommendations based on content filtering.",
)

# Launch the app
iface.launch()