# Igor Santana
# rnn model sent from github to huggingface
# 9c58361
from os import path
import csv
import math
import json
import yaml
import numpy as np
import pandas as pd
import multiprocessing as mp
from datetime import datetime, timedelta
def remove_sessions(df, leq=1):
    """Drop every session that has ``leq`` rows or fewer.

    Args:
        df: DataFrame with at least a ``session`` and a ``song`` column.
        leq: Sessions whose row count is less than or equal to this value
            are removed (default 1, i.e. single-song sessions are dropped).

    Returns:
        The rows of ``df`` whose session has more than ``leq`` rows, in
        their original order and with their original index.
    """
    # size() counts the rows of each session directly; there is no need to
    # materialise a Python list per column per session just to take len().
    session_sizes = df.groupby('session')['song'].size()
    to_stay = session_sizes.index[session_sizes > leq]
    return df[df['session'].isin(to_stay)]
def sessionize_user(ds, session_time, s_path):
    """Split a listening history into sessions and write it to ``s_path``.

    Reads ``dataset/<ds>/listening_history.csv`` and writes a CSV with the
    header ``user,song,timestamp,session``. A new session starts whenever
    the gap to the previous row is at least ``session_time`` minutes or the
    user differs from the previous row's user. Session ids are ``s0``,
    ``s1``, ... in file order.

    Args:
        ds: Dataset name, used as the directory under ``dataset/``.
        session_time: Minimum gap, in minutes, that starts a new session.
        s_path: Path of the sessionized CSV to create (overwritten).

    Notes:
        The input is expected to have the user in its first column, the
        song in its second column and a ``timestamp`` column.
        NOTE(review): rows are presumably already ordered by user and then
        by timestamp; nothing here sorts them -- confirm against the data.
    """
    df = pd.read_csv('dataset/{}/listening_history.csv'.format(ds), sep=',')

    # pd.to_datetime instead of astype('datetime64'): the unit-less cast is
    # rejected by pandas 2.x.
    timestamps = pd.to_datetime(df['timestamp'])
    # The first row has no predecessor (NaT gap); NaT compares False, so it
    # never counts as a time-based split.
    starts_new = timestamps.diff() >= timedelta(minutes=session_time)

    users = df.iloc[:, 0]
    songs = df.iloc[:, 1]

    s_no = 0
    first_row = True
    last_user = None
    # The context manager guarantees the output is flushed and closed even
    # if a row fails to serialise.
    with open(s_path, 'w', newline='') as f:
        # csv.writer stringifies non-str fields and quotes embedded commas,
        # which a manual ','.join() does not.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['user', 'song', 'timestamp', 'session'])
        print('Sessionized "%s" data file: %s' % (ds, s_path))
        for user, song, stamp, new_gap in zip(users, songs, timestamps, starts_new):
            # Only the very first row is exempt from the checks. Previously
            # the "last user" was re-synced on every row while the counter
            # was still 0, so a user change before the first time-based
            # split went unnoticed and several users ended up in 's0'.
            if not first_row and (new_gap or user != last_user):
                s_no += 1
            first_row = False
            last_user = user
            writer.writerow([user, song, stamp, 's{}'.format(s_no)])
def _write_context_windows(sequences, num_w, out_path):
    """Write one padded context window per song occurrence to ``out_path``.

    For every position in every sequence the window is the ``num_w`` songs
    before it, the song itself and the ``num_w`` songs after it; positions
    that fall outside the sequence are filled with ``'-'``. Windows are
    grouped by centre song (in order of first appearance) and written as
    ``<song>\\t<python list repr>``, one per line.
    """
    windows_by_song = {}
    for sequence in sequences:
        length = len(sequence)
        for ix, song in enumerate(sequence):
            before = [sequence[i] if i >= 0 else '-'
                      for i in range(ix - num_w, ix)]
            after = [sequence[i] if i < length else '-'
                     for i in range(ix + 1, ix + num_w + 1)]
            windows_by_song.setdefault(song, []).append(before + [song] + after)

    # Context manager so the file is flushed and closed deterministically.
    with open(out_path, 'w') as out:
        for song, windows in windows_by_song.items():
            for window in windows:
                print(song + '\t' + '{}'.format(window), file=out)


def gen_seq_files(df, pwd, window_size):
    """Generate the session-level and user-level context window files.

    Writes ``c_seqs.csv`` (songs grouped by ``session``) and ``u_seqs.csv``
    (songs grouped by ``user``) under ``pwd``.

    Args:
        df: DataFrame with ``session``, ``user`` and ``song`` columns.
        pwd: Output location; the file names are appended to it as-is, so
            it must already end with a path separator.
        window_size: Total context size; ``window_size // 2`` songs are
            taken on each side of the centre song.
    """
    c_sessions = df.groupby('session')['song'].agg(list)
    u_sessions = df.groupby('user')['song'].agg(list)
    num_w = window_size // 2

    _write_context_windows(c_sessions, num_w, pwd + 'c_seqs.csv')
    _write_context_windows(u_sessions, num_w, pwd + 'u_seqs.csv')
def preprocess(conf):
    """Sessionize the configured dataset unless that was already done.

    Reads the dataset name from ``conf['evaluation']['dataset']`` and the
    session gap from ``conf['session']['interval']``. If
    ``dataset/<name>/session_listening_history.csv`` already exists the
    function only reports it; otherwise it calls ``sessionize_user`` to
    create that file.
    """
    ds = conf['evaluation']['dataset']
    interval = conf['session']['interval']
    target = 'dataset/{}/session_listening_history.csv'.format(ds)

    if not path.exists(target):
        print('Started to sessionize dataset "%s"' % ds)
        sessionize_user(ds, interval, target)
        return

    print('The "%s" dataset is already sessionized' % ds)