File size: 3,937 Bytes
9c58361 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import os
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
import warnings
class Helper():
    """Lookup tables and cached matrices built from listening histories.

    ``train`` and ``test`` are indexed by user and expose a ``history``
    column; each history is a sequence of sessions and each session is a
    sequence of song ids.  ``songs`` is indexed by song id and exposes the
    ``m2v`` and ``sm2v`` vector columns.  ``ds`` names the sub-directory of
    ``tmp/`` used for the on-disk matrix caches.
    """

    def __init__(self, train, test, songs, ds):
        """Store the inputs and precompute the id/position lookup tables."""
        self.ds = ds
        self.train = train
        self.test = test
        self.songs = songs
        self.m2v_songs = self.songs.m2v.tolist()
        self.sm2v_songs = self.songs.sm2v.tolist()
        # song id -> row position, and the inverse mapping.
        self.songs_ix = {song: pos for pos, song in enumerate(songs.index)}
        self.ix_songs = dict(enumerate(songs.index))
        # NOTE: despite its name, ``ix_users`` maps user id -> row position.
        # Users are de-duplicated (first occurrence wins) so that positions
        # stay contiguous in ``range(num_users)`` even when a user appears
        # in both ``train`` and ``test``; the matrices below rely on that.
        users = np.concatenate(
            [train.index.values, test.index.values]
        ).tolist()
        self.ix_users = {
            user: pos for pos, user in enumerate(dict.fromkeys(users))
        }
        self.num_users = len(self.ix_users)
        self.num_songs = len(songs.index)
        # Row position -> preference vector / list of distinct songs.
        self.ix_pref = {
            pos: self.u_pref(user) for user, pos in self.ix_users.items()
        }
        self.ix_u_songs = {
            pos: self.unique_songs(user)
            for user, pos in self.ix_users.items()
        }

    def _history(self, user):
        """Return the stored history of ``user``.

        The ``test`` frame takes precedence over ``train`` when the user is
        present in both.  Raises ``KeyError`` when the user is in neither.
        """
        for frame in (self.test, self.train):
            if user in frame.index:
                return frame[frame.index == user]['history'].values[0]
        raise KeyError(user)

    def _cache_path(self, name):
        """Return the ``.npy`` cache path for the matrix called ``name``."""
        return 'tmp/{}/{}.npy'.format(self.ds, name)

    def _save_cache(self, path, matrix):
        """Write ``matrix`` to ``path``, creating the directory if needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        np.save(path, matrix)

    def user_sessions(self, user):
        """Split every test session of ``user`` into (first half, second half).

        For odd-length sessions the second half gets the extra song.
        """
        history = self.test.loc[user, 'history']
        sessions = []
        for session in history:
            mid = len(session) // 2
            sessions.append((session[:mid], session[mid:]))
        return sessions

    def song_ix(self, song):
        """Return the row position of ``song`` in the songs index."""
        return self.songs_ix[song]

    def ix_user(self, ix):
        """Look ``ix`` up in ``ix_users`` (keyed by user id, see __init__)."""
        return self.ix_users[ix]

    def unique_songs(self, user):
        """Return the distinct songs in the history of ``user`` as a list.

        The order of the returned list is unspecified.  Raises ``KeyError``
        when the user is unknown.
        """
        history = self._history(user)
        return list({song for session in history for song in session})

    def u_pref(self, user):
        """Return the mean ``m2v`` vector over the first half of each session.

        Only the first ``len(session) // 2`` songs of every session are used.
        Raises ``KeyError`` when the user is unknown.
        """
        history = self._history(user)
        vectors = [
            self.songs.loc[song, 'm2v']
            for session in history
            for song in session[:len(session) // 2]
        ]
        return np.mean(vectors, axis=0)

    def c_pref(self, songs):
        """Return the mean ``sm2v`` vector of the given ``songs``."""
        flat_vecs = self.songs.loc[songs, 'sm2v'].tolist()
        return np.mean(np.array(flat_vecs), axis=0)

    def get_n_largest(self, cos, n):
        """Return the song ids belonging to the ``n`` largest scores in ``cos``.

        ``cos`` holds one score per row of the songs index.  The result is
        not sorted.  ``n <= 0`` yields an empty array and ``n`` larger than
        the number of scores is clamped to return every song.
        """
        songs = self.songs.index.values
        scores = np.asarray(cos)
        if n <= 0:
            # ``-0`` would otherwise slice the whole array.
            return songs[:0]
        n = min(n, scores.shape[-1])
        index = np.argpartition(scores, -n)[-n:]
        return songs[index]

    def uu_matrix(self):
        """Return the (num_users, num_users) user-user matrix.

        Entry ``(i, j)`` is ``1 / (cos(pref_i, pref_j) + sqrt(s_i + s_j))``
        where ``s_k`` is the number of distinct songs of user ``k``.  The
        matrix is loaded from ``tmp/<ds>/matrix_users.npy`` when that file
        exists, otherwise computed and saved there.
        """
        path = self._cache_path('matrix_users')
        if os.path.isfile(path):
            return np.load(path)
        n_users = self.num_users
        # Both arrays are identical for every row, so build them once.
        prefs = np.array([self.ix_pref[i] for i in range(n_users)])
        n_songs = np.array(
            [len(self.ix_u_songs[i]) for i in range(n_users)], dtype=float
        )
        matrix_users = np.zeros((n_users, n_users))
        for ix in range(n_users):
            cos = cosine_similarity(prefs[ix].reshape(1, -1), prefs)
            y_array = np.sqrt(n_songs[ix] + n_songs)
            # ``cos`` is (1, n); flatten it and add directly instead of
            # stacking two differently-shaped arrays, which recent NumPy
            # versions reject as a ragged array.
            matrix_users[ix] = 1.0 / (cos.ravel() + y_array)
        self._save_cache(path, matrix_users)
        return matrix_users

    def us_matrix(self):
        """Return the (num_users, num_songs) 0/1 user-song matrix.

        Entry ``(u, s)`` is 1 when song ``s`` occurs in the history of user
        ``u``.  The matrix is loaded from ``tmp/<ds>/matrix_user_songs.npy``
        when that file exists, otherwise computed and saved there.
        """
        path = self._cache_path('matrix_user_songs')
        if os.path.isfile(path):
            return np.load(path)
        matrix_u_songs = np.zeros((self.num_users, self.num_songs))
        for u, songs in self.ix_u_songs.items():
            songs_ids = np.asarray(
                [self.songs_ix[s] for s in songs], dtype=np.intp
            )
            matrix_u_songs[u, songs_ids] = 1
        self._save_cache(path, matrix_u_songs)
        return matrix_u_songs
|