|
import sys |
|
import csv |
|
import os |
|
import yaml |
|
import pickle |
|
import numpy as np |
|
import pandas as pd |
|
import project.evaluation.metrics as m |
|
from os.path import exists |
|
from project.data.preparation import prepare_data, get_embeddings |
|
from project.recsys.helper import Helper |
|
from datetime import datetime |
|
from project.recsys.algorithms import execute_algo |
|
from project.evaluation.ResultReport import Results |
|
from keras.models import model_from_yaml |
|
|
|
def get_rnn():
    """Load the trained RNN from disk.

    The architecture is read from ``training_model.yaml`` and the weights
    from ``training_weights.h5``; both paths are relative to the current
    working directory.

    Returns:
        The model built by ``model_from_yaml`` with its weights loaded.
    """
    # Read the YAML text and close the file deterministically instead of
    # handing an open (never closed) file object to Keras.
    # NOTE(review): model_from_yaml deserializes YAML; only load architecture
    # files from a trusted source.
    with open('training_model.yaml', 'r') as yaml_file:
        model = model_from_yaml(yaml_file.read())
    model.load_weights('training_weights.h5')
    return model
|
|
|
def skip_all(executed, params, k):
    """Tell whether every fold of a setup has already been recorded.

    Args:
        executed: DataFrame of previously stored results; must have the
            columns ``params`` and ``folds``.
        params: Value identifying the setup in the ``params`` column.
        k: Total number of folds of the cross-validation.

    Returns:
        bool: True when the highest fold recorded for ``params`` is at
        least ``k``; False otherwise, including when nothing is recorded.
    """
    folds = executed.loc[executed['params'] == params, 'folds']
    # Nothing recorded for this setup: avoid relying on NaN comparisons.
    if folds.empty:
        return False
    # ``>=`` rather than ``==``: a results file written with a larger k
    # already covers folds 1..k, so there is nothing left to run.
    return bool(folds.max() >= k)
|
|
|
def skip_fold(executed, params, fold):
    """Tell whether a given fold of a setup has already been recorded.

    Args:
        executed: DataFrame of previously stored results; must have the
            columns ``params`` and ``folds``.
        params: Value identifying the setup in the ``params`` column.
        fold: 1-based number of the fold about to be run.

    Returns:
        bool: True when the highest fold recorded for ``params`` is at
        least ``fold``; False otherwise, including when nothing is recorded.
    """
    folds = executed.loc[executed['params'] == params, 'folds']
    # Nothing recorded for this setup: avoid relying on NaN comparisons.
    if folds.empty:
        return False
    return bool(folds.max() >= fold)
|
|
|
def cross_validation(df, conf, setups):
    """Run k-fold evaluation for every setup, resuming from saved results.

    Per-fold results are appended to the tab-separated file at
    ``conf['results']['full']``. Setups and folds already present in that
    file when the function starts are skipped.

    Args:
        df: DataFrame with a ``song`` column; also handed to
            ``prepare_data`` to build the folds.
        conf: Configuration mapping. Reads ``conf['evaluation']`` (keys
            ``dataset``, ``topN``, ``k``) and ``conf['results']['full']``.
        setups: Iterable of 3-item entries ``(_, params, path)``. ``params``
            is a string used both as the key in the results file and as a
            sub-directory name; ``path`` is passed to ``get_embeddings``.

    Returns:
        None. Output is written to the results file and under
        ``tmp/<dataset>/rec/<params>``.
    """
    eval_conf = conf['evaluation']
    result_paths = conf['results']

    # NOTE(review): kfold is iterated once per setup below. If prepare_data
    # returns a one-shot generator, only the first setup gets any folds.
    # Confirm that it returns a re-iterable sequence.
    kfold = prepare_data(df, conf)
    dataset = eval_conf['dataset']
    topN = int(eval_conf['topN'])
    k = int(eval_conf['k'])
    results = Results(setups, k)
    exec_path = result_paths['full']
    pwd_rec = 'tmp/{}/rec/'.format(dataset)

    # makedirs, not mkdir: the path is nested and its parents may be missing.
    if not exists(pwd_rec):
        os.makedirs(pwd_rec)
    if not exists(exec_path):
        # Seed the results file with a header row so later appends line up.
        header = ['params', 'algo', 'folds', 'prec', 'rec', 'f1', 'map', 'ndcg@5', 'p@5']
        pd.DataFrame({}, columns=header).to_csv(exec_path, index=None, sep='\t')

    # Snapshot of what has already been run; it is not refreshed while looping.
    executed = pd.read_csv(exec_path, sep='\t')

    for _, params, path in setups:
        rec_dir = pwd_rec + params
        if not exists(rec_dir):
            os.makedirs(rec_dir)
        if skip_all(executed, params, k):
            continue

        song_ids = df['song'].unique().tolist()
        m2v, sm2v = get_embeddings(path, song_ids)
        songs = pd.DataFrame(
            {
                'm2v': [m2v[x] for x in song_ids],
                'sm2v': [sm2v[x] for x in song_ids],
            },
            index=song_ids,
            columns=['m2v', 'sm2v'],
        )

        for fold, (train, test) in enumerate(kfold, start=1):
            if skip_fold(executed, params, fold):
                continue
            timestamp = datetime.now().strftime('%d/%m/%Y %H:%M')
            print('%s | fold-%d | Running recsys w/ k-fold with the following params: %s' % (timestamp, fold, params))
            helper = Helper(train, test, songs, dataset)
            m2vTN, sm2vTN, csm2vTN, csm2vUK = execute_algo(train.index, test.index, songs, topN, k, helper, rec_dir)
            res = results.fold_results(params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold)
            # Append without a header: the header was written when the file was created.
            res.to_csv(exec_path, sep='\t', mode='a', index=None, header=None)
|
|