import pandas as pd
import numpy as np
from scipy import stats
import csv
def process_csv(input_file, output_file="output.csv", user_id_column="user_id",
                timestamp_column="timestamp", action_column="actions"):
    """Collapse a long-format event log (one action per row) into one row
    per user, with that user's actions sorted by timestamp and joined by ';'."""
    with open(input_file, 'r', newline='') as csvfile, \
            open(output_file, 'w', newline='') as new_csvfile:
        reader = csv.reader(csvfile)
        writer = csv.writer(new_csvfile)
        # Get the column indices for user_id, timestamp, and action from the header
        header = next(reader)
        user_id_index = header.index(user_id_column)
        timestamp_index = header.index(timestamp_column)
        action_index = header.index(action_column)
        # Read the rest of the data and group (timestamp, action) pairs by user
        user_actions = {}
        for row in reader:
            user_id = row[user_id_index]
            timestamp = row[timestamp_index]
            action = row[action_index]
            user_actions.setdefault(user_id, []).append((timestamp, action))
        # Write the output CSV with the concatenated-actions column
        writer.writerow([user_id_column, action_column])
        for user_id, actions in user_actions.items():
            # Sort actions by timestamp in ascending order; this is a
            # lexicographic string sort, so timestamps should be in a
            # sortable format such as ISO 8601
            sorted_actions = sorted(actions, key=lambda pair: pair[0])
            concatenated_actions = ';'.join(action for _, action in sorted_actions)
            writer.writerow([user_id, concatenated_actions])
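# Example (a sketch; "raw_logs.csv" and its contents are hypothetical).
# Given a long-format log with one action per row,
#
#   user_id,timestamp,actions
#   u1,2023-01-01T10:00,read
#   u1,2023-01-01T10:05,quiz
#   u2,2023-01-01T09:00,post
#
# process_csv("raw_logs.csv") writes one row per user with the actions
# concatenated in timestamp order:
#
#   user_id,actions
#   u1,read;quiz
#   u2,post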
def generate_sequence_list(sentence, min_gap, max_gap, sliding_window_min=1,
                           sliding_window_max=1):
    """Split a ';'-separated action string into words and generate the
    n-grams of length min_gap..max_gap, joined with '--->'."""
    words = sentence.split(";")
    ngrams = []
    for sliding_window in range(sliding_window_min, sliding_window_max + 1):
        # Loop over n-gram lengths from min_gap to max_gap
        for gram_length in range(min_gap, max_gap + 1):
            # Slide over the action list in steps of `sliding_window`
            for i in range(0, len(words) - gram_length + 1, sliding_window):
                ngram = '--->'.join(words[i:i + gram_length])
                ngrams.append(ngram)
    return ngrams
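# Worked example (the action string below is made up). With min_gap=1,
# max_gap=2 and the default sliding window of 1, every subsequence of
# length 1 and 2 is emitted:
#
#   generate_sequence_list("read;quiz;post", min_gap=1, max_gap=2)
#   # -> ['read', 'quiz', 'post', 'read--->quiz', 'quiz--->post']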
def create_dict_from_df(df, identifier_column, sequence, min_gap, max_gap,
                        sliding_window_min=1, sliding_window_max=1):
    """Map each identifier to its list of mined patterns, and collect the
    set of all distinct patterns across the DataFrame."""
    result_dict = {}
    unique_values_set = set()  # Set of all distinct patterns seen so far
    for index, row in df.iterrows():
        key = row[identifier_column]
        values = generate_sequence_list(row[sequence], min_gap, max_gap,
                                        sliding_window_min, sliding_window_max)
        result_dict[key] = values
        unique_values_set.update(values)
    return result_dict, unique_values_set
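# Example (illustrative data). For a DataFrame with rows ("u1", "read;quiz")
# and ("u2", "quiz;post") and min_gap=max_gap=1,
#
#   create_dict_from_df(df, "user_id", "actions", 1, 1)
#   # -> ({'u1': ['read', 'quiz'], 'u2': ['quiz', 'post']},
#   #     {'read', 'quiz', 'post'})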
def create_dataframe_from_dict_and_set(result_dict, unique_values_set):
    # Initialize an empty dictionary to store counts
    counts_dict = {}
    # Iterate over the set of distinct patterns
    for value in unique_values_set:
        counts_dict[value] = {}
        # Count how often each identifier produced this pattern
        for key, values in result_dict.items():
            counts_dict[value][key] = values.count(value)
    # Create a DataFrame from the counts dictionary
    df = pd.DataFrame(counts_dict).fillna(0)
    # Transpose the DataFrame so that keys become columns and patterns become rows
    df = df.transpose()
    return df
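# Continuing the sketch above, the returned occurrence matrix has one row
# per pattern and one column per user, holding raw occurrence counts
# (row/column order depends on set and dict iteration order):
#
#         u1   u2
#   read   1    0
#   quiz   1    1
#   post   0    1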
def process_dataframe(df):
    # Number of students = number of columns in the occurrence matrix
    num_student = len(df.columns)
    # I-Frequency: total occurrences of each pattern across all students;
    # S-Frequency: number of students showing the pattern at least once
    I_Frequency = df.sum(axis=1)
    S_Frequency = (df > 0).sum(axis=1)
    # Create a DataFrame from the two frequency columns
    new_df = pd.DataFrame({
        'I-Frequency': I_Frequency,
        'S-Frequency': S_Frequency
    })
    # I-Support (mean): mean occurrences per student;
    # S-Support: share of students showing the pattern
    new_df['I-Support (mean)'] = new_df['I-Frequency'] / num_student
    new_df['S-Support'] = new_df['S-Frequency'] / num_student
    # Standard deviation of each pattern's per-student counts
    new_df['I-Support (sd)'] = df.std(axis=1, skipna=True)
    return new_df
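# For the 3x2 matrix sketched above: the pattern 'quiz' has per-student
# counts [1, 1], so I-Frequency = 2, S-Frequency = 2, and with 2 students
# I-Support (mean) = 1.0, S-Support = 1.0, and I-Support (sd) = 0.0.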
def calculate_p_value(test_type, vector_a, vector_b=None, **kwargs):
    """
    Calculate the p-value for different two-sample statistical tests.

    Parameters:
        test_type (str): Name of the scipy.stats test to perform.
        vector_a (array-like): Data for sample A.
        vector_b (array-like, optional): Data for sample B (required by the
            two-sample tests below).
        **kwargs: Additional keyword arguments forwarded to the test.

    Returns:
        p_value (float): The p-value obtained from the test.
    """
    # Map test names to the corresponding scipy.stats functions. Note that
    # poisson_means_test expects scalar counts (k1, n1, plus k2, n2 via
    # kwargs) rather than two data vectors, and that poisson_means_test and
    # bws_test require newer SciPy releases.
    test_functions = {
        'poisson_means_test': stats.poisson_means_test,      # Poisson means test
        'ttest_ind': stats.ttest_ind,                        # t-test, two independent samples
        'mannwhitneyu': stats.mannwhitneyu,                  # Mann-Whitney U rank test
        'bws_test': stats.bws_test,                          # Baumgartner-Weiss-Schindler test
        'ranksums': stats.ranksums,                          # Wilcoxon rank-sum test
        'brunnermunzel': stats.brunnermunzel,                # Brunner-Munzel test
        'mood': stats.mood,                                  # Mood's test for equal scale parameters
        'ansari': stats.ansari,                              # Ansari-Bradley test for equal scale parameters
        'cramervonmises_2samp': stats.cramervonmises_2samp,  # two-sample Cramér-von Mises test
        'epps_singleton_2samp': stats.epps_singleton_2samp,  # Epps-Singleton (ES) test
        'ks_2samp': stats.ks_2samp,                          # two-sample Kolmogorov-Smirnov test
        'kstest': stats.kstest,                              # one- or two-sample Kolmogorov-Smirnov test
    }
    if test_type not in test_functions:
        raise ValueError(f"Invalid test type: {test_type!r}")
    result = test_functions[test_type](vector_a, vector_b, **kwargs)
    # All of these tests return a result object exposing .pvalue
    return result.pvalue
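# Example (the data are arbitrary):
#
#   a = [3, 1, 4, 1, 5, 9, 2, 6]
#   b = [2, 7, 1, 8, 2, 8, 1, 8]
#   p = calculate_p_value('mannwhitneyu', a, b)   # float in (0, 1]
#
# Keyword arguments are forwarded unchanged, e.g.
# calculate_p_value('ttest_ind', a, b, equal_var=False) runs Welch's t-test.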
def SPM_(path_to_csv, dataset_format, identifier_column, sequence_column,
         sortby="S-Support", min_gap=1, max_gap=1, sliding_window_min=1,
         sliding_window_max=1, S_support_thresh=0, I_support_thresh=0,
         timestamp_column="timestamp"):
    # dataset_format == 1: long-format log, collapse it to one row per user first
    if dataset_format == 1:
        process_csv(path_to_csv, output_file="output.csv",
                    user_id_column=identifier_column,
                    timestamp_column=timestamp_column,
                    action_column=sequence_column)
        path_to_csv = "output.csv"
    # Read CSV file
    data = pd.read_csv(path_to_csv)
    # Create {identifier: patterns} dictionary and the pattern corpus
    data_seq, corpus = create_dict_from_df(data, identifier_column, sequence_column,
                                           min_gap, max_gap,
                                           sliding_window_min, sliding_window_max)
    # Create the pattern-by-student occurrence matrix
    occurrence_matrix = create_dataframe_from_dict_and_set(data_seq, corpus)
    # Compute frequency/support statistics and sort
    spm_result = process_dataframe(occurrence_matrix)
    spm_result = spm_result.sort_values(by=sortby, ascending=False)
    # Keep only patterns above both support thresholds
    return (spm_result[(spm_result['S-Support'] > S_support_thresh)
                       & (spm_result['I-Support (mean)'] > I_support_thresh)],
            occurrence_matrix)
def SPM(config):
    """Dictionary-configured wrapper around SPM_."""
    path_to_csv = config.get('path_to_csv')
    dataset_format = config.get('dataset_format')
    identifier_column = config.get('identifier_column')
    sequence_column = config.get('sequence_column')
    sortby = config.get('sortby', "S-Support")
    min_gap = config.get('min_gap', 1)
    max_gap = config.get('max_gap', 1)
    sliding_window_min = config.get('sliding_window_min', 1)
    sliding_window_max = config.get('sliding_window_max', 1)
    S_support_thresh = config.get('S_support_thresh', 0)
    I_support_thresh = config.get('I_support_thresh', 0)
    timestamp_column = config.get('timestamp_column', "timestamp")
    return SPM_(path_to_csv, dataset_format, identifier_column, sequence_column,
                sortby, min_gap, max_gap, sliding_window_min, sliding_window_max,
                S_support_thresh, I_support_thresh, timestamp_column)
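# Example configuration (a sketch; the file name and column names are
# hypothetical). dataset_format=1 means the CSV is a long-format log (one
# action per row) and is first collapsed by process_csv; use 0 when the
# actions are already concatenated with ';'.
#
#   spm_result, occurrence_matrix = SPM({
#       'path_to_csv': 'raw_logs.csv',
#       'dataset_format': 1,
#       'identifier_column': 'user_id',
#       'sequence_column': 'actions',
#       'max_gap': 2,               # mine patterns up to length 2
#       'S_support_thresh': 0.1,    # keep patterns shown by >10% of users
#   })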
def DSM(config):
    """Differential sequence mining: mine patterns in two groups and keep
    those whose per-student counts differ significantly between groups."""
    path_to_csv_left = config['path_to_csv_left']
    dataset_format = config['dataset_format']
    path_to_csv_right = config['path_to_csv_right']
    identifier_column = config['identifier_column']
    sequence_column = config['sequence_column']
    sortby = config['sortby']
    min_gap = config['min_gap']
    max_gap = config['max_gap']
    sliding_window_min = config['sliding_window_min']
    sliding_window_max = config['sliding_window_max']
    S_support_thresh = config['S_support_thresh']
    I_support_thresh = config['I_support_thresh']
    threshold_pvalue = config['threshold_pvalue']
    test_type = config['test_type']
    timestamp_column = config.get('timestamp_column', 'timestamp')
    # Collapse long-format logs first, then treat them as format 0 below
    if dataset_format == 1:
        process_csv(path_to_csv_left, output_file="output_left.csv",
                    user_id_column=identifier_column,
                    timestamp_column=timestamp_column, action_column=sequence_column)
        path_to_csv_left = "output_left.csv"
        process_csv(path_to_csv_right, output_file="output_right.csv",
                    user_id_column=identifier_column,
                    timestamp_column=timestamp_column, action_column=sequence_column)
        path_to_csv_right = "output_right.csv"
    ptrn_left = []
    ptrn_right = []
    ptrn_both_left = []
    ptrn_both_right = []
    # Mine each group separately (dataset_format 0: already collapsed)
    spm_result_left, occurrence_matrix_left = SPM_(
        path_to_csv_left, 0, identifier_column, sequence_column, sortby,
        min_gap, max_gap, sliding_window_min, sliding_window_max,
        S_support_thresh, I_support_thresh)
    spm_result_right, occurrence_matrix_right = SPM_(
        path_to_csv_right, 0, identifier_column, sequence_column, sortby,
        min_gap, max_gap, sliding_window_min, sliding_window_max,
        S_support_thresh, I_support_thresh)
    result_data = []
    # Union of all patterns that survived the support thresholds in either group
    all_ptrn = set(spm_result_left.index)
    all_ptrn.update(spm_result_right.index)
    left_ptrn_data = set(spm_result_left.index)
    right_ptrn_data = set(spm_result_right.index)
    for ptrn in all_ptrn:
        # Per-student counts; a zero vector if the pattern is absent from a group
        isupport_left = (occurrence_matrix_left.loc[ptrn, :].values
                         if ptrn in spm_result_left.index
                         else np.zeros(occurrence_matrix_left.shape[1]))
        isupport_right = (occurrence_matrix_right.loc[ptrn, :].values
                          if ptrn in spm_result_right.index
                          else np.zeros(occurrence_matrix_right.shape[1]))
        p_value = calculate_p_value(test_type, isupport_left, isupport_right)
        if p_value < threshold_pvalue:
            if (ptrn in left_ptrn_data) and (ptrn in right_ptrn_data):
                # Pattern frequent in both groups: label it by the larger mean
                if isupport_left.mean() > isupport_right.mean():
                    ptrn_both_left.append(ptrn)
                    result_data.append((ptrn, p_value, isupport_left.mean(),
                                        isupport_right.mean(), "both_left"))
                else:
                    ptrn_both_right.append(ptrn)
                    result_data.append((ptrn, p_value, isupport_left.mean(),
                                        isupport_right.mean(), "both_right"))
            else:
                if ptrn in left_ptrn_data:
                    ptrn_left.append(ptrn)
                    result_data.append((ptrn, p_value, isupport_left.mean(),
                                        np.nan, "left"))
                else:
                    ptrn_right.append(ptrn)
                    result_data.append((ptrn, p_value, np.nan,
                                        isupport_right.mean(), "right"))
    # 'p_value' holds the p-value from whichever test was selected above
    result_df = pd.DataFrame(result_data,
                             columns=['ptrn', 'p_value', 'isupportleft_mean',
                                      'isupportright_mean', "Group"])
    return ptrn_left, ptrn_right, ptrn_both_left, ptrn_both_right, result_df
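# Example (a sketch; the file names are hypothetical). Unlike SPM, DSM
# reads every configuration key without a default except timestamp_column:
#
#   left_only, right_only, both_left, both_right, table = DSM({
#       'path_to_csv_left': 'group_a.csv',
#       'path_to_csv_right': 'group_b.csv',
#       'dataset_format': 0,
#       'identifier_column': 'user_id',
#       'sequence_column': 'actions',
#       'sortby': 'S-Support',
#       'min_gap': 1, 'max_gap': 2,
#       'sliding_window_min': 1, 'sliding_window_max': 1,
#       'S_support_thresh': 0, 'I_support_thresh': 0,
#       'threshold_pvalue': 0.05,
#       'test_type': 'mannwhitneyu',
#   })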