import pandas as pd
import random
import numpy as np
import pickle
from os                                 import makedirs
from os.path                            import exists
from gensim.models                      import Word2Vec, Doc2Vec
from glove                              import Glove
from sklearn.model_selection            import KFold

def _rnn_load(path, songs):
    # Pre-computed RNN embeddings stored as a pickled {song: vector} dict.
    with open(path, 'rb') as f:
        data = pickle.load(f)
    emb_dict = {}
    for song in songs:
        emb_dict[song] = data[song]
    return emb_dict

def __w2v_load(path, songs):
    # Word vectors from a gensim Word2Vec-style model file; doc2vec paths are
    # routed here as well (see get_embeddings), since those models also expose `.wv`.
    wv = Word2Vec.load(path).wv
    emb_dict = {}
    for song in songs:
        emb_dict[song] = wv[song]
    return emb_dict

def __g_load(path, songs):
    # Vectors from a trained glove-python model: look up each song's row in
    # word_vectors via the model's dictionary.
    glove = Glove.load(path)
    emb_dict = {}
    for song in songs:
        emb_dict[song] = glove.word_vectors[glove.dictionary[song]]
    return emb_dict

def __load_exp(path, songs):
    # Experiment embeddings are pickled dicts that already map each song to its
    # vector; `songs` is unused but kept for a uniform loader signature.
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_embeddings(path, songs):
    # The session-level embedding file sits next to the user-level one and
    # shares its name with an 's' prefix (e.g. 'model.pkl' -> 'smodel.pkl').
    path_arr        = path.split('/')
    session_file    = '/'.join(path_arr[:-1] + ['s' + path_arr[-1]])
    user_file       = path

    if 'experiments' in path:
        return __load_exp(user_file, songs), __load_exp(session_file, songs)
    if 'glove' in path:
        return __g_load(user_file, songs), __g_load(session_file, songs)
    if 'music2vec' in path:
        return __w2v_load(user_file, songs), __w2v_load(session_file, songs)
    if 'doc2vec' in path:
        return __w2v_load(user_file, songs), __w2v_load(session_file, songs)
    if 'rnn' in path:
        return _rnn_load(user_file, songs), _rnn_load(session_file, songs)
    return {}, {}

def prepare_data(df, conf):
    # Build (train, test) user folds and cache them on disk, so repeated runs
    # reuse the same split (KFold shuffles without a fixed random_state).
    ds                  = conf['evaluation']['dataset']
    path_kfold          = 'tmp/{}/kfold/'.format(ds)
    if exists(path_kfold):
        kfold = []
        for i in range(0, conf['evaluation']['k']):
            j = i + 1
            train = pd.read_pickle(path_kfold + 'train_{}.pkl'.format(j))
            test  = pd.read_pickle(path_kfold + 'test_{}.pkl'.format(j))
            kfold.append((train, test))
        return kfold
    makedirs(path_kfold)
    # Each user's history is a list of sessions, each session a list of songs.
    sessions            = df.groupby('session')['song'].apply(lambda x: x.tolist())
    users               = df.groupby('user').agg(list)
    users['history']    = users['session'].apply(lambda x: [sessions[session] for session in list(set(x))])
    users               = users.drop(['song', 'timestamp', 'session'], axis=1)
    unique_users        = df.user.unique()
    kf                  = KFold(n_splits=conf['evaluation']['k'], shuffle=True)
    i       = 1
    kfold   = []
    for train, test in kf.split(unique_users):
        train_df = users[users.index.isin(unique_users[train])]
        test_df  = users[users.index.isin(unique_users[test])]
        train_df.to_pickle(path_kfold + 'train_{}.pkl'.format(i))
        test_df.to_pickle(path_kfold + 'test_{}.pkl'.format(i))
        kfold.append((train_df, test_df))
        i += 1
    return kfold
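

# --- Usage sketch (illustrative only, not part of the original pipeline) ---
# A minimal example of how the helpers above fit together. The config keys
# mirror those read by prepare_data(); the DataFrame columns ('user',
# 'session', 'song', 'timestamp') are inferred from that function, and the
# dataset name, values, and embedding path below are assumptions.
if __name__ == '__main__':
    conf = {'evaluation': {'dataset': 'example_ds', 'k': 2}}
    # Hypothetical listening events with the columns prepare_data() expects.
    df = pd.DataFrame({
        'user':      ['u1', 'u1', 'u2', 'u2', 'u3', 'u4'],
        'session':   ['s1', 's1', 's2', 's2', 's3', 's4'],
        'song':      ['a',  'b',  'a',  'c',  'b',  'c'],
        'timestamp': [1, 2, 3, 4, 5, 6],
    })
    kfold = prepare_data(df, conf)   # writes/reads folds under tmp/example_ds/kfold/
    # Embedding lookup would follow the same pattern once model files exist,
    # e.g. (hypothetical path):
    # user_emb, session_emb = get_embeddings('embeddings/music2vec/model.w2v', df.song.unique())
    for fold, (train_df, test_df) in enumerate(kfold, start=1):
        print('fold {}: {} train users, {} test users'.format(fold, len(train_df), len(test_df)))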