File size: 3,114 Bytes
725c528
 
 
 
30059ee
 
 
88a2d43
 
e28d195
35aecae
725c528
30059ee
 
 
 
 
 
725c528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5254954
35aecae
725c528
 
 
 
5254954
9c81a28
 
725c528
 
 
 
693eb08
993d08d
c2e19f2
725c528
 
88a2d43
993d08d
7dbbd88
 
 
924915f
7dbbd88
88a2d43
9dba8e7
c2e19f2
 
7dbbd88
 
 
 
 
725c528
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import gradio as gr
import zipfile
import requests
import io
import random

# Number of randomly sampled titles offered in the UI dropdown.
input_count = 300
# Upper slice bound used when ranking; yields result_count - 1 recommendations.
result_count = 21

# Download and extract the MovieLens dataset
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# Bound the wait so a stalled server cannot hang startup forever, and fail
# with a clear HTTP error instead of a confusing BadZipFile on an error page.
response = requests.get(url, timeout=60)
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('ml-latest-small/movies.csv') as f:
        movies = pd.read_csv(f)

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
movies['genres'] = movies['genres'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix. linear_kernel is a plain dot product;
# it equals cosine similarity here because TfidfVectorizer L2-normalises rows
# by default.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to row index.
# Series.drop_duplicates() would compare the *values* (row indices, already
# unique) and therefore keep repeated titles; de-duplicate on the title labels
# instead so that indices[title] is always a single integer (first occurrence).
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies most similar to ``title`` according to ``cosine_sim``.

    Args:
        title: Movie title used as the lookup key in the module-level
            ``indices`` Series.
        cosine_sim: Square pairwise similarity matrix whose rows/columns are
            aligned with the rows of ``movies``. Defaults to the module-level
            matrix.

    Returns:
        A list of at most ``result_count - 1`` ``(title, score)`` tuples in
        descending score order (ties keep row order). The queried movie itself
        is never part of the result.

    Raises:
        KeyError: If ``title`` is not a key of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series
    # rather than a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every *other* movie with its similarity to the queried one. The
    # query row is excluded explicitly: many movies tie at the maximum score
    # (identical genres), so it is not guaranteed to sort into position 0 and
    # blindly skipping the first entry could drop the wrong movie.
    candidates = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]

    # Sort the movies based on the similarity scores (stable for ties)
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the same number of results the original [1:result_count] slice gave
    top = candidates[:max(result_count - 1, 0)]

    # Return the most similar movies with their scores
    return [(movies['title'].iloc[row], score) for row, score in top]

# Gradio interface
def recommend_movies(movie):
    """Format the recommendations for ``movie`` as a plain-text table.

    Args:
        movie: Title chosen in the UI. May be ``None`` or empty when nothing
            has been selected yet.

    Returns:
        A string with a ``Score  Title`` header followed by one line per
        recommendation, or a short prompt when no movie was selected.
    """
    # Nothing selected: looking up None/"" would raise a KeyError downstream,
    # so answer with a prompt instead of surfacing an error in the UI.
    if not movie:
        return "Please select a movie."

    recommendations = get_recommendations(movie)

    format_string = "{:>5.2f}       {:<20}"
    rows = [format_string.format(score, title) for title, score in recommendations]
    return "Score     Title\n" + "\n".join(rows)

# Build the Gradio UI. Only a random subset of titles is offered in the
# dropdown; the label reports the full catalogue size alongside it.
total_movies = len(movies)
movie_list = random.sample(movies['title'].tolist(), input_count)

with gr.Blocks() as iface:
    # First tab: the working genre-similarity recommender.
    with gr.Tab("Content-Based Filtering"):
        gr.Interface(
            fn=recommend_movies,
            inputs=gr.Dropdown(
                movie_list,
                label=f"Select a Movie (Total movies: {total_movies}, randomly list {input_count} for demo purpose.)",
            ),
            outputs=[gr.Textbox(label="Recommended Movies:")],
            title="Movie Recommender - Content-Based Filtering",
            description="Select a movie to get recommendations based on content filtering.",
        )

    # Second tab: placeholder only, no functionality yet.
    with gr.Tab("Collaborative Filtering"):
        gr.Markdown("## Recommendation - Collaborative Filtering")
        gr.Markdown("### In construction")

# Start serving the app
iface.launch()