File size: 2,530 Bytes
9c58361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import sys
import csv
import os
import yaml
import pickle
import numpy                            as np
import pandas                           as pd
import project.evaluation.metrics       as m
from os.path                        	import exists
from project.data.preparation			import prepare_data, get_embeddings
from project.recsys.helper              import Helper
from datetime                           import datetime
from project.recsys.algorithms        	import execute_algo
from project.evaluation.ResultReport	import Results
from keras.models                   	import model_from_yaml

def get_rnn():
    """Rebuild the RNN from its YAML description and load its trained weights.

    The architecture is read from ``training_model.yaml`` and the weights from
    ``training_weights.h5``; both paths are relative to the current working
    directory.

    Returns:
        The model produced by ``model_from_yaml`` after ``load_weights`` has
        been applied to it.
    """
    # Read the description inside a context manager so the file handle is
    # closed deterministically, and hand the YAML text (not the file object)
    # to model_from_yaml.
    # NOTE(review): the YAML file is deserialized by Keras; it should only
    # ever come from a trusted source.
    with open('training_model.yaml', 'r') as yaml_file:
        model = model_from_yaml(yaml_file.read())
    model.load_weights('training_weights.h5')
    return model

def skip_all(executed, params, k):
    """Tell whether every fold of a parameter combination was already run.

    Args:
        executed: DataFrame of previous results with ``'params'`` and
            ``'folds'`` columns.
        params: Value to match against the ``'params'`` column.
        k: Fold number that marks a finished run.

    Returns:
        The result of comparing the highest recorded fold for ``params`` with
        ``k``. When no row matches, the maximum is NaN and the comparison is
        false.
    """
    matches_params = executed['params'] == params
    highest_fold = executed.loc[matches_params, 'folds'].max()
    return highest_fold == k

def skip_fold(executed, params, fold):
    """Tell whether a given fold of a parameter combination was already run.

    Args:
        executed: DataFrame of previous results with ``'params'`` and
            ``'folds'`` columns.
        params: Value to match against the ``'params'`` column.
        fold: Fold number about to be processed.

    Returns:
        The result of testing whether the highest recorded fold for
        ``params`` is at least ``fold``. When no row matches, the maximum is
        NaN and the comparison is false.
    """
    matches_params = executed['params'] == params
    highest_fold = executed.loc[matches_params, 'folds'].max()
    return highest_fold >= fold

def cross_validation(df, conf, setups):
    """Run the recommender algorithms on every fold for each embedding setup.

    After each fold the metrics returned by ``Results.fold_results`` are
    appended to the tab-separated file at ``conf['results']['full']``.
    (params, fold) combinations already recorded in that file are skipped,
    which lets a re-run continue where a previous one stopped.

    Args:
        df: Data set; must expose a ``'song'`` column. It is also forwarded
            to ``prepare_data`` to build the folds.
        conf: Configuration mapping. ``conf['evaluation']`` supplies
            ``dataset``, ``topN`` and ``k``; ``conf['results']['full']`` is
            the path of the results file.
        setups: Iterable of 3-item setups. The second item is the string
            naming the parameter combination and the third is passed to
            ``get_embeddings``.
    """
    eval_conf = conf['evaluation']
    r_paths = conf['results']

    # NOTE(review): kfold is iterated once per setup below. If prepare_data
    # returns a one-shot iterator, every setup after the first would see no
    # folds -- confirm that it returns a re-iterable sequence.
    kfold = prepare_data(df, conf)
    dataset = eval_conf['dataset']
    topN = int(eval_conf['topN'])
    k = int(eval_conf['k'])
    results = Results(setups, k)
    exec_path = r_paths['full']
    pwd_rec = 'tmp/{}/rec/'.format(dataset)

    # makedirs creates missing parent directories as well (plain mkdir fails
    # on this nested path when 'tmp/<dataset>' is absent) and exist_ok makes
    # the call safe when the directory is already there.
    os.makedirs(pwd_rec, exist_ok=True)
    if not exists(exec_path):
        # Seed the results file with a header row only.
        columns = ['params', 'algo', 'folds', 'prec', 'rec', 'f1', 'map', 'ndcg@5', 'p@5']
        pd.DataFrame({}, columns=columns).to_csv(exec_path, index=None, sep='\t')

    # Snapshot of what was already computed before this run started.
    executed = pd.read_csv(exec_path, sep='\t')

    for _, params, path in setups:
        os.makedirs(pwd_rec + params, exist_ok=True)
        if skip_all(executed, params, k):
            continue

        song_ids = df['song'].unique().tolist()
        m2v, sm2v = get_embeddings(path, song_ids)
        # One row per song holding both embeddings, indexed by song id.
        songs = pd.DataFrame(
            {
                'm2v': [m2v[x] for x in song_ids],
                'sm2v': [sm2v[x] for x in song_ids],
            },
            index=song_ids,
            columns=['m2v', 'sm2v'],
        )

        # Folds are numbered from 1 to match the 'folds' column on disk.
        for fold, (train, test) in enumerate(kfold, start=1):
            if skip_fold(executed, params, fold):
                continue
            timestamp = datetime.now().strftime('%d/%m/%Y %H:%M')
            print('%s | fold-%d | Running recsys w/ k-fold with the following params: %s' % (timestamp, fold, params))
            helper = Helper(train, test, songs, dataset)
            m2vTN, sm2vTN, csm2vTN, csm2vUK = execute_algo(
                train.index, test.index, songs, topN, k, helper, pwd_rec + params)
            res = results.fold_results(params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold)
            # Append without a header so the file keeps its single header row.
            res.to_csv(exec_path, sep='\t', mode='a', index=None, header=None)