|
from os import path |
|
import csv |
|
import math |
|
import json |
|
import yaml |
|
import numpy as np |
|
import pandas as pd |
|
import multiprocessing as mp |
|
from datetime import datetime, timedelta |
|
|
|
def remove_sessions(df, leq=1):
    """Drop every session that has ``leq`` or fewer rows.

    Args:
        df: DataFrame with at least a ``session`` and a ``song`` column.
        leq: Sessions whose row count is less than or equal to this value
            are removed (default 1, i.e. single-song sessions are dropped).

    Returns:
        The rows of ``df`` belonging to sessions with more than ``leq``
        rows, with the original index and column layout preserved.
    """
    # size() counts rows per session directly (NaN songs included), which
    # avoids materialising a Python list for every column of every group.
    songs_per_session = df.groupby(by='session')['song'].size()
    to_stay = songs_per_session[songs_per_session > leq].index.values
    return df[df['session'].isin(to_stay)]
|
|
|
|
|
def sessionize_user(ds, session_time, s_path):
    """Split a listening history into sessions and write it to ``s_path``.

    Reads ``dataset/<ds>/listening_history.csv`` and walks its rows in file
    order. A new session starts whenever the user differs from the previous
    row's user, or the time elapsed since the previous row is at least
    ``session_time`` minutes. Sessions are labelled ``s0``, ``s1``, ... in
    order of appearance.

    Args:
        ds: Dataset name; used to build the input path.
        session_time: Inactivity threshold, in minutes, that closes a session.
        s_path: Destination CSV path. The file is overwritten and gets the
            header ``user,song,timestamp,session``.

    Notes:
        The first two input columns are taken positionally as user and song;
        the time column must be named ``timestamp``.
        NOTE(review): rows are presumably pre-sorted by user and then by
        timestamp -- confirm against the dataset, nothing here sorts them.
    """
    df = pd.read_csv('dataset/{}/listening_history.csv'.format(ds), sep=',')

    # pd.to_datetime rather than astype('datetime64'): pandas 2.x rejects
    # casting to a unit-less datetime64 dtype.
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Vectorised gap test. The first row's gap is NaT, which compares False.
    gaps = df['timestamp'].diff()
    starts_new = (gaps >= timedelta(minutes=session_time)).to_numpy()

    print('Sessionized "%s" data file: %s' % (ds, s_path))

    s_no = 0
    last_user = None
    # newline='' plus an explicit lineterminator keeps '\n' line endings while
    # letting csv.writer quote fields that contain commas or quotes.
    with open(s_path, 'w+', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['user', 'song', 'timestamp', 'session'])
        rows = zip(df.iloc[:, 0], df.iloc[:, 1], df['timestamp'], starts_new)
        for position, (user, song, stamp, is_new) in enumerate(rows):
            # The very first row always opens session s0. Every later row
            # bumps the counter on a time gap or on a change of user, even
            # while the counter is still 0.
            if position > 0 and (is_new or user != last_user):
                s_no += 1
            last_user = user
            writer.writerow([user, song, str(stamp), 's{}'.format(s_no)])
|
|
|
def _write_context_windows(sequences, out_path, num_w):
    """Write one padded context window per song occurrence to ``out_path``.

    For every position in every sequence the window holds the ``num_w``
    preceding songs, the song itself and the ``num_w`` following songs, with
    ``'-'`` standing in for positions outside the sequence. Windows are
    grouped by centre song (in order of first appearance) and written as
    ``<song>\\t<python list repr>`` lines.
    """
    windows_by_song = {}
    for sequence in sequences:
        length = len(sequence)
        for ix, song in enumerate(sequence):
            before = [sequence[i] if i >= 0 else '-'
                      for i in range(ix - num_w, ix)]
            after = [sequence[i] if i < length else '-'
                     for i in range(ix + 1, ix + num_w + 1)]
            windows_by_song.setdefault(song, []).append(before + [song] + after)

    # The context manager guarantees the file is flushed and closed.
    with open(out_path, 'w+') as out:
        for song, windows in windows_by_song.items():
            for window in windows:
                print('{}\t{}'.format(song, window), file=out)


def gen_seq_files(df, pwd, window_size):
    """Generate per-session and per-user song context-window files.

    Args:
        df: DataFrame with ``session``, ``user`` and ``song`` columns.
        pwd: Path prefix for the outputs; file names are appended by plain
            string concatenation, so it should end with a path separator.
        window_size: Total window width; ``window_size // 2`` songs are taken
            on each side of the centre song.

    Writes ``<pwd>c_seqs.csv`` (windows built within each session) and
    ``<pwd>u_seqs.csv`` (windows built over each user's full history).
    """
    c_sessions = df.groupby('session')['song'].agg(list)
    u_sessions = df.groupby('user')['song'].agg(list)
    num_w = window_size // 2

    _write_context_windows(c_sessions, pwd + 'c_seqs.csv', num_w)
    _write_context_windows(u_sessions, pwd + 'u_seqs.csv', num_w)
|
|
|
|
|
def preprocess(conf):
    """Sessionize the configured dataset unless that was already done.

    Args:
        conf: Configuration mapping; reads ``conf['evaluation']['dataset']``
            (dataset name) and ``conf['session']['interval']`` (passed to
            ``sessionize_user`` as the session time).

    If ``dataset/<name>/session_listening_history.csv`` already exists, a
    notice is printed and nothing else happens; otherwise the file is
    produced by ``sessionize_user``.
    """
    ds = conf['evaluation']['dataset']
    interval = conf['session']['interval']
    target = 'dataset/{}/session_listening_history.csv'.format(ds)

    if not path.exists(target):
        print('Started to sessionize dataset "%s"' % ds)
        sessionize_user(ds, interval, target)
    else:
        print('The "%s" dataset is already sessionized' % ds)
|
|
|
|
|
|