|
from torch_geometric.datasets import MovieLens100K
|
|
from sklearn.model_selection import train_test_split
|
|
import torch.nn as nn
|
|
import torch
|
|
import torch.nn.functional as F
|
|
import numpy as np
|
|
import pandas as pd
|
|
from torch_geometric.nn import GCNConv, GATConv
|
|
import logging
|
|
|
|
# Route INFO-and-above records to metrics.log, each prefixed with a timestamp.
logging.basicConfig(
    filename='metrics.log',
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
class GNN(torch.nn.Module):
    """Two-layer graph network over a joint user/movie node set.

    User and movie feature matrices are projected to a common width by two
    separate linear layers, stacked (users first, then movies) into a single
    node matrix, and passed through two graph convolutions with batch
    normalisation and ReLU after the first one.

    Args:
        model_type: ``'GCN'`` or ``'GAT'``; selects the convolution layers.
        in_channels: Width both feature matrices are projected to.
        hidden_channels: ``out_channels`` argument of the first convolution.
        out_channels: ``out_channels`` argument of the second convolution.
        user_in_features: Number of raw user feature columns.  When omitted,
            ``user_features.shape[1]`` is read from module scope.
        movie_in_features: Number of raw movie feature columns.  When
            omitted, ``movie_features.shape[1]`` is read from module scope.

    Raises:
        ValueError: If ``model_type`` is neither ``'GCN'`` nor ``'GAT'``.
    """

    _SUPPORTED_TYPES = ('GCN', 'GAT')

    def __init__(self, model_type, in_channels, hidden_channels, out_channels,
                 user_in_features=None, movie_in_features=None):
        super().__init__()

        # Fail at construction time: without this check an unknown type would
        # yield a model lacking conv1/conv2/bn1 that only breaks in forward().
        if model_type not in self._SUPPORTED_TYPES:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {self._SUPPORTED_TYPES}"
            )
        self.model_type = model_type

        # Fall back to the module-level feature tensors only when the caller
        # did not state the input widths explicitly.
        if user_in_features is None:
            user_in_features = user_features.shape[1]
        if movie_in_features is None:
            movie_in_features = movie_features.shape[1]

        self.fc1 = nn.Linear(user_in_features, in_channels)
        self.fc2 = nn.Linear(movie_in_features, in_channels)

        if model_type == 'GCN':
            self.conv1 = GCNConv(in_channels, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, out_channels)
            self.bn1 = torch.nn.BatchNorm1d(hidden_channels)
        else:  # 'GAT'
            # Both layers use 2 concatenated heads, so everything fed by the
            # first layer is sized hidden_channels * 2.
            # NOTE(review): with concat=True the second layer presumably
            # emits 2 * out_channels features -- confirm against GATConv docs.
            self.conv1 = GATConv(in_channels, hidden_channels, heads=2, concat=True)
            self.conv2 = GATConv(hidden_channels * 2, out_channels, heads=2, concat=True)
            self.bn1 = torch.nn.BatchNorm1d(hidden_channels * 2)

    def forward(self, x, y, edge_index):
        """Return one embedding row per node (users first, then movies).

        Args:
            x: User feature matrix; projected by ``fc1``.
            y: Movie feature matrix; projected by ``fc2``.
            edge_index: Graph connectivity handed to both convolutions.
                Because the rows are stacked users-first, a movie's node
                index is its row in ``y`` plus the number of rows in ``x``.

        Returns:
            The output of the second convolution for the stacked node matrix.
        """
        x = self.fc1(x)
        y = self.fc2(y)
        z = torch.cat((x, y), dim=0)
        z = F.relu(self.bn1(self.conv1(z, edge_index)))
        z = self.conv2(z, edge_index)
        return z
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Genre names used when tallying genres; 'Unknown' is not part of this list.
gtypes = [
    'Action',
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western',
]

# Position in ``gtypes`` -> genre name.
genres = dict(enumerate(gtypes))

# Fixed list of ids (presumably users singled out for evaluation -- verify
# against the code that consumes it).
test_users = [
    327, 388, 404, 449, 707, 310, 605, 832, 850, 302,
    523, 626, 774, 853, 522, 542, 680, 703, 929, 254,
    526, 588, 884, 210, 275, 497, 507, 598, 825, 937,
    311, 380, 448, 541, 885, 938, 409, 429, 433, 451,
    534, 551, 585, 896, 33, 109, 120, 215, 261, 412,
    425, 559, 615, 617, 829, 49, 78, 137, 192, 198,
    281, 305, 394, 528, 669,
]
|
|
|
|
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
|
|
|
|
# Names applied to the last 19 columns of the movie table.
genre_columns = [
    'Unknown',
    'Action',
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western',
]

# The movie table is pipe-separated, has no header row and is Latin-1 encoded.
file_path = 'u.item'
df = pd.read_csv(file_path, sep='|', header=None, encoding='latin-1')

# Give the trailing 19 positional columns their genre names ...
last_19_cols = df.columns[-19:]
df = df.rename(
    columns={old: new for old, new in zip(last_19_cols, genre_columns)}
)
# ... and call positional column 1 "info".
df = df.rename(columns={1: "info"})

# First parenthesised 4-digit number found in "info", e.g. "(1995)" -> "1995".
df['Year'] = df['info'].str.extract(r'\((\d{4})\)')

# Row index -> "info" string.
id_movie_map = df["info"].to_dict()
|
|
|
|
# First (index 0) graph object of the MovieLens100K dataset rooted at
# ./data/movie_lens.
movie_lens = MovieLens100K('./data/movie_lens')[0]

# Per-node feature matrices of the two node types.
user_features = movie_lens["user"]["x"]
movie_features = movie_lens["movie"]["x"]

# Storage of the user -> movie "rates" relation.
data = movie_lens["user", "rates", "movie"]

# Keep only interactions rated 3 or higher.
mask = data["rating"].ge(3)
data_edge_label = data["rating"][mask]
data_edge_index = data["edge_index"][:, mask]
|
|
|
|
# Split user indices 80/20 with a fixed seed, then cut the 20% part in half:
# the first half becomes validation users, the second half test users.
user_num_nodes = user_features.shape[0]
train_nodes, testing_nodes = train_test_split(
    range(user_num_nodes),
    test_size=0.2,
    random_state=42,
)
val_nodes = testing_nodes[:len(testing_nodes) // 2]
test_nodes = testing_nodes[len(testing_nodes) // 2:]

# Row 0 of the edge index is matched against the user splits; an edge goes to
# a split when its row-0 endpoint belongs to that split.
Y = data_edge_index[0]
val_mask = torch.isin(Y, torch.tensor(val_nodes))
test_mask = torch.isin(Y, torch.tensor(test_nodes))
val_edge_index = data_edge_index[:, val_mask]
test_edge_index = data_edge_index[:, test_mask]
|
|
|
|
# Move the feature matrices and both edge sets to the selected device.
user_features = user_features.to(device)
movie_features = movie_features.to(device)
val_edge_index = val_edge_index.to(device)
test_edge_index = test_edge_index.to(device)

# Shift row 1 of each edge set by the number of users, so those indices point
# past the user rows in a users-then-movies node matrix.
val_edge_index[1] += user_features.shape[0]
test_edge_index[1] += user_features.shape[0]
|
|
|
|
# Build the network with the widths used by this script.
model_type = "GCN"
model = GNN(
    model_type,
    in_channels=32,
    hidden_channels=128,
    out_channels=64,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model = model.to(device)

# Restore saved weights; tensors are first mapped to the CPU and then copied
# into the model's parameters.
# NOTE(review): torch.load unpickles the file unless weights_only is used --
# only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load('model_GCN.pth', map_location=torch.device('cpu'))
)

# Inference only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    embeddings = model(user_features, movie_features, test_edge_index)

# Cosine similarity taken along dim 1.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

# Distinct row-0 endpoints that occur in the test edges.
users = torch.unique(test_edge_index[0])
|
|
|
|
def display_scores(top_k, test_edges_q_indices):
    """Log and print ranking metrics for one recommendation list.

    The same message is sent to ``logging.info`` and to standard output.  It
    reports precision (hits / number of ranked items), recall (hits / number
    of relevant items), the reciprocal rank of the first hit, and nDCG with
    binary relevance.

    Args:
        top_k: Ranked recommendations, best first.  Each item is tested with
            ``in`` against ``test_edges_q_indices``.
        test_edges_q_indices: The relevant items; must support ``len()`` and
            ``in``.

    Returns:
        None.  Every metric is reported as 0.0 when it is undefined (no
        ranked items, no relevant items, or no hit in the list).
    """
    # Test membership once per ranked item and reuse the flags for every
    # metric instead of repeating the lookup.
    hit_flags = [node in test_edges_q_indices for node in top_k]
    hits = sum(hit_flags)
    n_ranked = len(hit_flags)
    n_relevant = len(test_edges_q_indices)

    # Guard both ratios so an empty list or empty relevant set cannot divide
    # by zero.
    p_10 = hits / n_ranked if n_ranked else 0.0
    r_10 = hits / n_relevant if n_relevant else 0.0

    # Reciprocal rank of the first hit; stays 0.0 when nothing was hit.
    mrr = 0.0
    for rank, is_hit in enumerate(hit_flags, start=1):
        if is_hit:
            mrr = 1 / rank
            break

    # DCG with binary gains, discounted by log2(rank + 1).
    dcg = 0.0
    for rank, is_hit in enumerate(hit_flags, start=1):
        if is_hit:
            dcg += 1 / np.log2(rank + 1)

    # The ideal ranking puts every relevant item first, limited by how many
    # slots the list actually has.
    ideal_relevant = min(n_relevant, n_ranked)
    idcg = sum(1 / np.log2(rank + 1) for rank in range(1, ideal_relevant + 1))
    ndcg = dcg / idcg if idcg > 0 else 0.0

    message = f"Precision@10: {p_10}, Recall@10: {r_10}, MRR: {mrr}, nDCG: {ndcg}"
    logging.info(message)
    print(message)
|
|
|
|
def get_genres_movies_for_user(user_id):
    """Recommend ten movies for a user and summarise the user's genre tastes.

    Every movie embedding (the rows of ``embeddings`` after the user rows) is
    ranked by cosine similarity to the user's embedding.  The ten best are
    scored against the user's test edges via ``display_scores`` and then
    looked up in ``df`` for their title and genres.

    Args:
        user_id: Row index of the user in ``embeddings``; also the value
            matched against row 0 of ``test_edge_index``.

    Returns:
        tuple: ``(top_genre_user, our_movies)``.  ``top_genre_user`` is a list
        of the three genre names that occur most often among the movies on
        the user's test edges (ties keep genre order).  ``our_movies`` is a
        list with one ``{"title": ..., "genres": [...]}`` dict per
        recommended movie, best match first.
    """
    n_users = user_features.shape[0]
    curr_node = torch.tensor(user_id)

    # Rank all movie rows by similarity to this user's embedding.
    self_emb = embeddings[curr_node]
    itm = embeddings[n_users:]
    similarities = cos(self_emb, itm)
    sorted_indices = torch.argsort(similarities, descending=True)

    # Slice before converting to Python objects: only ten entries are needed,
    # so there is no reason to walk the whole ranking.  Adding n_users turns
    # a movie row number back into a node index.
    top_k = [idx + n_users for idx in sorted_indices[:10]]

    # Row-1 endpoints of the test edges whose row-0 endpoint is this user.
    test_edges_q_indices = test_edge_index[1][test_edge_index[0] == curr_node]

    display_scores(top_k, test_edges_q_indices)

    # Back to row positions in df.
    pred_mids = [(node - n_users).item() for node in top_k]
    actual_mids = [(node - n_users).item() for node in test_edges_q_indices]

    # Count how often each genre is flagged among the user's test movies.
    user_genre = {gr_id: 0 for gr_id in range(len(genres))}
    for amid in actual_mids:
        row = df.iloc[amid]  # fetch the row once, not once per genre
        for gr_id, g in genres.items():
            if row[g] == 1:
                user_genre[gr_id] += 1

    # Stable sort, so genres with equal counts keep their original order.
    sort_user_genre = sorted(user_genre, key=user_genre.get, reverse=True)
    top_genre_user = [
        genres[sort_user_genre[0]],
        genres[sort_user_genre[1]],
        genres[sort_user_genre[2]],
    ]

    our_movies = []
    for pmid in pred_mids:
        row = df.iloc[pmid]
        our_movies.append({
            "title": row["info"],
            "genres": [g for g in genres.values() if row[g] == 1],
        })

    return top_genre_user, our_movies