File size: 3,065 Bytes
725c528
 
 
 
30059ee
88a2d43
 
e28d195
35aecae
725c528
02f4def
 
30059ee
 
725c528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd07bd3
725c528
 
 
 
 
 
 
 
 
5254954
35aecae
725c528
 
 
 
5254954
9c81a28
 
725c528
 
 
1581906
 
 
725c528
993d08d
c2e19f2
725c528
 
88a2d43
993d08d
7dbbd88
 
 
924915f
7dbbd88
88a2d43
9dba8e7
c2e19f2
 
7dbbd88
 
 
 
 
725c528
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import gradio as gr
import zipfile
import random

# Number of titles offered in the demo dropdown.
input_count = 300
# get_recommendations returns result_count - 1 entries (20 with this value).
result_count = 21

# Read movies.csv directly from the MovieLens archive
with zipfile.ZipFile('ml-latest-small.zip') as z:
    with z.open('ml-latest-small/movies.csv') as f:
        movies = pd.read_csv(f)

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
movies['genres'] = movies['genres'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix
# (TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and movie titles
# Series.drop_duplicates() compares *values* (the row numbers, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first row of each title, so that indices[title]
# is always a single integer.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies whose genres are most similar to those of *title*.

    Args:
        title: Movie title exactly as it appears in ``movies['title']``.
        cosine_sim: Square pairwise similarity matrix; row/column ``i``
            corresponds to row ``i`` of ``movies``.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score,
        holding at most ``result_count - 1`` entries. The queried movie
        itself is never part of the result.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs several times in the dataset yields a Series here;
    # fall back to its first occurrence so idx is always a scalar row number.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity to the queried one. The
    # query is excluded by index rather than by dropping the first sorted
    # entry: many movies share identical genres and tie at the top score, so
    # the first sorted entry is not guaranteed to be the query itself.
    candidates = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep
    # dataset order)
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the result_count - 1 best matches
    top_matches = candidates[:max(result_count - 1, 0)]

    # Return the most similar movies with their scores
    return [(movies['title'].iloc[row], score) for row, score in top_matches]

# Gradio interface
def recommend_movies(movie):
    """Format the recommendations for *movie* as a plain-text table.

    Args:
        movie: Title chosen in the dropdown; may be empty or ``None`` when
            nothing has been selected.

    Returns:
        A header line followed by one "score  title" line per
        recommendation, or a hint message when no movie was selected.
    """
    if movie:
        row_template = "{:>5.2f}       {:<20}"
        rows = []
        # get_recommendations yields (title, score); the table shows score first
        for name, similarity in get_recommendations(movie):
            rows.append(row_template.format(similarity, name))
        return "Score     Title\n" + "\n".join(rows)

    return "No movie selected. Please select one from the dropdown."

# Create the Gradio interface
# Offer every distinct title at most once; dict.fromkeys keeps dataset order.
unique_titles = list(dict.fromkeys(movies['title'].tolist()))
# random.sample raises ValueError when asked for more items than the
# population holds, so never request more than the titles available.
movie_list = random.sample(unique_titles, min(input_count, len(unique_titles)))
total_movies = len(movies)

with gr.Blocks() as iface:
    with gr.Tab("Content-Based Filtering"):
        gr.Interface(fn=recommend_movies,
                     inputs=gr.Dropdown(movie_list, label=f"Select a Movie (Total movies: {total_movies}, randomly list {len(movie_list)} for demo purpose.)"),
                     outputs=[gr.Textbox(label="Recommended Movies:")],
                     title="Movie Recommender - Content-Based Filtering",
                     description="Select a movie to get recommendations based on content filtering.")

    # Placeholder tab; collaborative filtering is not implemented yet.
    with gr.Tab("Collaborative Filtering"):
        gr.Markdown("## Recommendation - Collaborative Filtering")
        gr.Markdown("### In construction")

# Launch the app
iface.launch()