Commit
·
61faab2
1
Parent(s):
a684e88
Fully pickled recomms
Browse files
- core.py +9 -130
- requirements.txt +1 -4
- user_recomms.pkl +3 -0
core.py
CHANGED
@@ -1,17 +1,3 @@
|
|
1 |
-
import pip
|
2 |
-
|
3 |
-
def install(package):
    """Install ``package`` into the running interpreter's environment via pip.

    pip is launched as ``<this interpreter> -m pip install <package>`` rather
    than through ``pip.main`` / ``pip._internal.main``: pip does not provide a
    supported in-process API, and ``pip._internal`` is not guaranteed to be
    loaded by a bare ``import pip``.

    Args:
        package: Requirement specifier handed to ``pip install``.

    Returns:
        None. As with the in-process calls this replaces, a failing pip exit
        status is not turned into an exception.
    """
    # Local imports keep this block independent of the file-level import list.
    import subprocess
    import sys

    # sys.executable pins the install to the interpreter that is running this
    # module; the argument list form avoids any shell interpretation.
    subprocess.call([sys.executable, '-m', 'pip', 'install', package])
|
8 |
-
|
9 |
-
print("Everything goes bang.")
|
10 |
-
install('torch_geometric')
|
11 |
-
install('torch_scatter')
|
12 |
-
install('torch_sparse')
|
13 |
-
print("It's havoc baby!")
|
14 |
-
|
15 |
import pickle
|
16 |
import numpy as np
|
17 |
import pandas as pd
|
@@ -19,112 +5,15 @@ import random
|
|
19 |
from tqdm import tqdm
|
20 |
import matplotlib.pyplot as plt
|
21 |
from sklearn.model_selection import train_test_split
|
22 |
-
import torch
|
23 |
-
from torch import nn, optim, Tensor
|
24 |
-
from torch_sparse import SparseTensor, matmul
|
25 |
-
from torch_geometric.utils import structured_negative_sampling
|
26 |
-
from torch_geometric.data import download_url, extract_zip
|
27 |
-
from torch_geometric.nn.conv.gcn_conv import gcn_norm
|
28 |
-
from torch_geometric.nn.conv import MessagePassing
|
29 |
-
from torch_geometric.typing import Adj
|
30 |
from sklearn.neighbors import BallTree
|
31 |
-
from thefuzz import fuzz
|
32 |
-
from thefuzz import process
|
33 |
-
|
34 |
-
class LightGCN(MessagePassing):
    """Embedding propagation model over a combined user/item node set.

    Two embedding tables (users, items) hold the layer-0 vectors. ``forward``
    concatenates them, pushes them through the normalized adjacency
    ``diffusion_steps`` times, and averages the layer-0 vectors with every
    propagated layer.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
        """Store the graph sizes and create the layer-0 embedding tables.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_dim: Width of every embedding vector.
            diffusion_steps: Number of propagation steps (K) run by ``forward``.
            add_self_loops: Forwarded to ``gcn_norm`` when normalizing.
        """
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.diffusion_steps = diffusion_steps
        self.add_self_loops = add_self_loops

        # Layer-0 tables: e_u^0 for users, e_i^0 for items.
        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim)
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim)

        # Overwrite the default initialisation with N(0, 0.1^2) draws.
        nn.init.normal_(self.users_emb.weight, std=0.1)
        nn.init.normal_(self.items_emb.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        """Propagate the embeddings and return final and layer-0 vectors.

        Args:
            edge_index: Adjacency passed to ``gcn_norm`` for normalization.

        Returns:
            A 4-tuple ``(users_final, users_layer0, items_final, items_layer0)``
            where the "final" tensors are the mean over layers 0..K and the
            layer-0 tensors are the raw embedding table weights.
        """
        adj_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        # Users first, then items: the split below relies on this row order.
        layer_0 = torch.cat([self.users_emb.weight, self.items_emb.weight])

        # layers[k] holds the embeddings after k propagation steps.
        layers = [layer_0]
        for _ in range(self.diffusion_steps):
            layers.append(self.propagate(adj_norm, x=layers[-1]))

        # Average across the layer axis (dim=1 of the stacked tensor).
        averaged = torch.stack(layers, dim=1).mean(dim=1)

        users_final, items_final = torch.split(averaged, [self.num_users, self.num_items])
        return users_final, self.users_emb.weight, items_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        """Return the neighbour features unchanged (identity message)."""
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fuse message and aggregation into one sparse-dense product."""
        return matmul(adj_t, x)
|
99 |
-
|
100 |
-
|
101 |
-
model = LightGCN(671, 9125)
|
102 |
-
|
103 |
-
def get_movie_recommendations(user_id, num_recomms):
    """Rank items for one user and split them into already-rated and new ones.

    Scores every item as the dot product of its embedding with the user's
    embedding, takes the top ``len(positives) + num_recomms`` items, and
    partitions them by membership in ``user_pos_items[user_id]``.

    Args:
        user_id: Key into ``user_mapping`` and ``user_pos_items``.
        num_recomms: Maximum length of each returned list.

    Returns:
        ``(rated_movie_ids, suggested_movie_ids)``: keys of ``movie_mapping``
        for top-ranked items the user already has among their positives, and
        for top-ranked items they do not, each in descending score order.

    Raises:
        KeyError: If ``user_id`` is unknown, or a ranked index has no entry
            in ``movie_mapping``.
    """
    user_index = user_mapping[user_id]

    # NOTE(review): scoring reads the raw embedding tables, not the propagated
    # embeddings returned by model.forward -- confirm this is intended.
    user_embedding = model.users_emb.weight[user_index]
    scores = model.items_emb.weight @ user_embedding

    positives = user_pos_items[user_id]

    # Over-fetch by the number of positives so that num_recomms unseen items
    # remain even if every positive ranks at the top.
    _, indices = torch.topk(scores, k=len(positives) + num_recomms)
    ranked = indices.tolist()  # plain ints, converted once

    # Reverse map (model index -> movie id) built once instead of rescanning
    # the mapping per movie. setdefault keeps the first key for an index,
    # matching what list(...).index(...) would have selected.
    index_to_movie_id = {}
    for movie_id, movie_index in movie_mapping.items():
        index_to_movie_id.setdefault(movie_index, movie_id)

    rated_movies = [index for index in ranked if index in positives][:num_recomms]
    suggested_movies = [index for index in ranked if index not in positives][:num_recomms]

    rated_movie_ids = [index_to_movie_id[index] for index in rated_movies]
    suggested_movie_ids = [index_to_movie_id[index] for index in suggested_movies]

    return rated_movie_ids, suggested_movie_ids
|
125 |
|
126 |
addr = './'
|
127 |
|
|
|
|
|
|
|
|
|
|
|
128 |
model.load_state_dict(torch.load(addr + 'model.pth'))
|
129 |
|
130 |
final_movies_file = open(addr + 'final_movies.pkl', "rb")
|
@@ -184,23 +73,13 @@ def find_closest_user(user_embedding, tree, user_embeddings):
|
|
184 |
return closest_user_embedding
|
185 |
|
186 |
|
187 |
-
def
|
188 |
-
non_numerical_columns = df.select_dtypes(exclude=[float, int]).columns
|
189 |
-
return df.drop(columns=non_numerical_columns, inplace=False)
|
190 |
-
|
191 |
-
def output_list(input_dict, movies_df = movie_embeds, tree = btree, user_embeddings = user_embeds, movies = final_movies):
|
192 |
-
movie_ratings = {}
|
193 |
-
for movie_title in input_dict:
|
194 |
-
matching_title = process.extractOne(movie_title, final_movies['title'].values, scorer=fuzz.partial_token_sort_ratio)[0]
|
195 |
-
index = movies.index[movies['title'] == matching_title].tolist()[0]
|
196 |
-
movie_ratings[index] = input_dict[movie_title]
|
197 |
user_embed = create_user_embedding(movie_ratings, movie_embeds)
|
198 |
# Call the find_closest_user function with the pre-built BallTree
|
199 |
closest_user_embed = find_closest_user(user_embed, tree, user_embeds)
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
return out2
|
204 |
|
205 |
# output_list({1:1,2:2,3:3,4:4,5:5})
|
206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pickle
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
|
|
5 |
from tqdm import tqdm
|
6 |
import matplotlib.pyplot as plt
|
7 |
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from sklearn.neighbors import BallTree
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
addr = './'
|
11 |
|
12 |
+
# Precomputed recommendations, unpickled once at import time; output_list
# looks entries up by integer user id. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load executes code embedded in the file -- only ship a
# user_recomms.pkl produced by this project.
with open(addr + 'user_recomms.pkl', "rb") as user_recomms_file:
    user_recomms = pickle.load(user_recomms_file)

# The `model.load_state_dict(torch.load(addr + 'model.pth'))` call that used to
# follow is intentionally not kept: this revision no longer imports torch or
# defines `model`, so the call would raise NameError when the module is
# imported, and the pickled recommendations replace the model at runtime.
|
18 |
|
19 |
final_movies_file = open(addr + 'final_movies.pkl', "rb")
|
|
|
73 |
return closest_user_embedding
|
74 |
|
75 |
|
76 |
+
def output_list(movie_ratings, movies_df=movie_embeds, tree=btree, user_embeddings=user_embeds, movies=final_movies):
    """Return recommended movie titles for a set of movie ratings.

    Builds an embedding for the supplied ratings, finds the closest known user
    with the prebuilt tree, and maps that user's precomputed recommendations
    to titles.

    Args:
        movie_ratings: Ratings passed straight to ``create_user_embedding``
            (presumably a mapping of movie index to rating -- verify against
            ``create_user_embedding``).
        movies_df: Movie embeddings used to build the user embedding.
        tree: Prebuilt search tree handed to ``find_closest_user``.
        user_embeddings: Known user embeddings searched by ``find_closest_user``.
        movies: Table with a ``title`` column, indexed positionally by the
            recommended movie ids.

    Returns:
        List of titles, one per entry in the closest user's recommendations,
        in the stored order.

    Raises:
        KeyError: If the closest user's id has no entry in ``user_recomms``.
    """
    # Use the parameters rather than the module globals so that callers who
    # override movies_df / user_embeddings actually affect the result.
    user_embed = create_user_embedding(movie_ratings, movies_df)

    # Nearest-neighbour lookup against the pre-built tree.
    closest_user_embed = find_closest_user(user_embed, tree, user_embeddings)

    # user_recomms is keyed by integer user id.
    recomms = user_recomms[int(closest_user_embed['userId'])]

    return [movies['title'].iloc[movie_id] for movie_id in recomms]
|
|
|
83 |
|
84 |
# output_list({1:1,2:2,3:3,4:4,5:5})
|
85 |
|
requirements.txt
CHANGED
@@ -3,7 +3,4 @@ pillow
|
|
3 |
numpy==1.23.5
|
4 |
pandas==1.5.3
|
5 |
thefuzz[speedup]
|
6 |
-
scikit-learn==1.2.2
|
7 |
-
torch==2.0.0
|
8 |
-
torchvision==0.15.1
|
9 |
-
torchaudio==2.0.1
|
|
|
3 |
numpy==1.23.5
|
4 |
pandas==1.5.3
|
5 |
thefuzz[speedup]
|
6 |
+
scikit-learn==1.2.2
|
|
|
|
|
|
user_recomms.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6898b9039369a91e0aa792c09b6bbe8308b5c2d71a297152364bac31699cd60f
|
3 |
+
size 20177
|