Andrea Maldonado committed on
Commit
96c9671
·
1 Parent(s): f4bce6d

Moves parameters to param_keys

Browse files
gedi/utils/io_helpers.py CHANGED
@@ -6,6 +6,7 @@ import re
6
  import shutil
7
  import numpy as np
8
  from collections import defaultdict
 
9
  from pathlib import PurePath
10
  from scipy.spatial.distance import euclidean
11
 
@@ -89,19 +90,19 @@ def normalize_value(value, min_val, max_val):
89
 
90
  def compute_similarity(v1, v2):
91
  feature_ranges = bpic_feature_values()
92
-
93
  # Convert all values to float except for the "log" key
94
  v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
95
  v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
96
-
97
  # Identify common numeric keys
98
  common_keys = set(v1.keys()).intersection(set(v2.keys()), set(feature_ranges.keys()))
99
  numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
100
-
101
  if not numeric_keys:
102
  print("[ERROR]: No common numeric keys found for similarity calculation.")
103
  return None
104
-
105
  # Normalize values and compute differences
106
  differences = []
107
  for key in numeric_keys:
@@ -109,115 +110,7 @@ def compute_similarity(v1, v2):
109
  norm_v1 = normalize_value(v1[key], min_val, max_val)
110
  norm_v2 = normalize_value(v2[key], min_val, max_val)
111
  differences.append(abs(norm_v1 - norm_v2))
112
-
113
  # Compute average difference as similarity metric
114
  target_similarity = 1 - np.mean(differences)
115
  return target_similarity
116
-
117
-
118
- def bpic_feature_values():
119
-
120
- data_dict = {
121
- "n_traces": [226.0, 251734.0],
122
- "n_variants": [6.0, 28457.0],
123
- "ratio_variants_per_number_of_traces": [0.0, 1.0],
124
- "trace_len_min": [1.0, 24.0],
125
- "trace_len_max": [1.0, 2973.0],
126
- "trace_len_mean": [1.0, 131.49],
127
- "trace_len_median": [1.0, 55.0],
128
- "trace_len_mode": [1.0, 61.0],
129
- "trace_len_std": [0.0, 202.53],
130
- "trace_len_variance": [0.0, 41017.89],
131
- "trace_len_q1": [1.0, 44.0],
132
- "trace_len_q3": [1.0, 169.0],
133
- "trace_len_iqr": [0.0, 161.0],
134
- "trace_len_geometric_mean": [1.0, 53.78],
135
- "trace_len_geometric_std": [1.0, 5.65],
136
- "trace_len_harmonic_mean": [1.0, 51.65],
137
- "trace_len_skewness": [-0.58, 111.97],
138
- "trace_len_kurtosis": [-0.97, 14006.75],
139
- "trace_len_coefficient_variation": [0.0, 4.74],
140
- "trace_len_entropy": [5.33, 12.04],
141
- "trace_len_hist1": [0.0, 1.99],
142
- "trace_len_hist2": [0.0, 0.42],
143
- "trace_len_hist3": [0.0, 0.4],
144
- "trace_len_hist4": [0.0, 0.19],
145
- "trace_len_hist5": [0.0, 0.14],
146
- "trace_len_hist6": [0.0, 10.0],
147
- "trace_len_hist7": [0.0, 0.02],
148
- "trace_len_hist8": [0.0, 0.04],
149
- "trace_len_hist9": [0.0, 0.0],
150
- "trace_len_hist10": [0.0, 2.7],
151
- "trace_len_skewness_hist": [-0.58, 111.97],
152
- "trace_len_kurtosis_hist": [-0.97, 14006.75],
153
- "ratio_most_common_variant": [0.0, 0.79],
154
- "ratio_top_1_variants": [0.0, 0.87],
155
- "ratio_top_5_variants": [0.0, 0.98],
156
- "ratio_top_10_variants": [0.0, 0.99],
157
- "ratio_top_20_variants": [0.2, 1.0],
158
- "ratio_top_50_variants": [0.5, 1.0],
159
- "ratio_top_75_variants": [0.75, 1.0],
160
- "mean_variant_occurrence": [1.0, 24500.67],
161
- "std_variant_occurrence": [0.04, 42344.04],
162
- "skewness_variant_occurrence": [1.54, 64.77],
163
- "kurtosis_variant_occurrence": [0.66, 5083.46],
164
- "n_unique_activities": [1.0, 1152.0],
165
- "activities_min": [1.0, 66058.0],
166
- "activities_max": [34.0, 466141.0],
167
- "activities_mean": [4.13, 66058.0],
168
- "activities_median": [2.0, 66058.0],
169
- "activities_std": [0.0, 120522.25],
170
- "activities_variance": [0.0, 14525612122.34],
171
- "activities_q1": [1.0, 66058.0],
172
- "activities_q3": [4.0, 79860.0],
173
- "activities_iqr": [0.0, 77290.0],
174
- "activities_skewness": [-0.06, 15.21],
175
- "activities_kurtosis": [-1.5, 315.84],
176
- "n_unique_start_activities": [1.0, 809.0],
177
- "start_activities_min": [1.0, 150370.0],
178
- "start_activities_max": [27.0, 199867.0],
179
- "start_activities_mean": [3.7, 150370.0],
180
- "start_activities_median": [1.0, 150370.0],
181
- "start_activities_std": [0.0, 65387.49],
182
- "start_activities_variance": [0.0, 4275524278.19],
183
- "start_activities_q1": [1.0, 150370.0],
184
- "start_activities_q3": [4.0, 150370.0],
185
- "start_activities_iqr": [0.0, 23387.25],
186
- "start_activities_skewness": [0.0, 9.3],
187
- "start_activities_kurtosis": [-2.0, 101.82],
188
- "n_unique_end_activities": [1.0, 757.0],
189
- "end_activities_min": [1.0, 16653.0],
190
- "end_activities_max": [28.0, 181328.0],
191
- "end_activities_mean": [3.53, 24500.67],
192
- "end_activities_median": [1.0, 16653.0],
193
- "end_activities_std": [0.0, 42344.04],
194
- "end_activities_variance": [0.0, 1793017566.89],
195
- "end_activities_q1": [1.0, 16653.0],
196
- "end_activities_q3": [3.0, 39876.0],
197
- "end_activities_iqr": [0.0, 39766.0],
198
- "end_activities_skewness": [-0.7, 13.82],
199
- "end_activities_kurtosis": [-2.0, 255.39],
200
- "eventropy_trace": [0.0, 13.36],
201
- "eventropy_prefix": [0.0, 16.77],
202
- "eventropy_global_block": [0.0, 24.71],
203
- "eventropy_lempel_ziv": [0.0, 685.0],
204
- "eventropy_k_block_diff_1": [-328.0, 962.0],
205
- "eventropy_k_block_diff_3": [0.0, 871.0],
206
- "eventropy_k_block_diff_5": [0.0, 881.0],
207
- "eventropy_k_block_ratio_1": [0.0, 935.0],
208
- "eventropy_k_block_ratio_3": [0.0, 7.11],
209
- "eventropy_k_block_ratio_5": [0.0, 7.11],
210
- "eventropy_knn_3": [0.0, 8.93],
211
- "eventropy_knn_5": [0.0, 648.0],
212
- "eventropy_knn_7": [0.0, 618.0],
213
- "epa_variant_entropy": [0.0, 11563842.15],
214
- "epa_normalized_variant_entropy": [0.0, 0.9],
215
- "epa_sequence_entropy": [0.0, 21146257.12],
216
- "epa_normalized_sequence_entropy": [0.0, 0.76],
217
- "epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
218
- "epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
219
- "epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
220
- "epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51]
221
- }
222
-
223
- return data_dict
 
6
  import shutil
7
  import numpy as np
8
  from collections import defaultdict
9
+ from gedi.utils.param_keys.features import bpic_feature_values
10
  from pathlib import PurePath
11
  from scipy.spatial.distance import euclidean
12
 
 
90
 
91
  def compute_similarity(v1, v2):
92
  feature_ranges = bpic_feature_values()
93
+
94
  # Convert all values to float except for the "log" key
95
  v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
96
  v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
97
+
98
  # Identify common numeric keys
99
  common_keys = set(v1.keys()).intersection(set(v2.keys()), set(feature_ranges.keys()))
100
  numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
101
+
102
  if not numeric_keys:
103
  print("[ERROR]: No common numeric keys found for similarity calculation.")
104
  return None
105
+
106
  # Normalize values and compute differences
107
  differences = []
108
  for key in numeric_keys:
 
110
  norm_v1 = normalize_value(v1[key], min_val, max_val)
111
  norm_v2 = normalize_value(v2[key], min_val, max_val)
112
  differences.append(abs(norm_v1 - norm_v2))
113
+
114
  # Compute average difference as similarity metric
115
  target_similarity = 1 - np.mean(differences)
116
  return target_similarity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gedi/utils/param_keys/features.py CHANGED
@@ -1,3 +1,109 @@
1
  # Features params
2
  FEATURE_PARAMS = 'feature_params'
3
  FEATURE_SET = 'feature_set'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Features params
2
  FEATURE_PARAMS = 'feature_params'
3
  FEATURE_SET = 'feature_set'
4
+
5
+ def bpic_feature_values():
6
+ data_dict = {
7
+ "n_traces": [226.0, 251734.0],
8
+ "n_variants": [6.0, 28457.0],
9
+ "ratio_variants_per_number_of_traces": [0.0, 1.0],
10
+ "trace_len_min": [1.0, 24.0],
11
+ "trace_len_max": [1.0, 2973.0],
12
+ "trace_len_mean": [1.0, 131.49],
13
+ "trace_len_median": [1.0, 55.0],
14
+ "trace_len_mode": [1.0, 61.0],
15
+ "trace_len_std": [0.0, 202.53],
16
+ "trace_len_variance": [0.0, 41017.89],
17
+ "trace_len_q1": [1.0, 44.0],
18
+ "trace_len_q3": [1.0, 169.0],
19
+ "trace_len_iqr": [0.0, 161.0],
20
+ "trace_len_geometric_mean": [1.0, 53.78],
21
+ "trace_len_geometric_std": [1.0, 5.65],
22
+ "trace_len_harmonic_mean": [1.0, 51.65],
23
+ "trace_len_skewness": [-0.58, 111.97],
24
+ "trace_len_kurtosis": [-0.97, 14006.75],
25
+ "trace_len_coefficient_variation": [0.0, 4.74],
26
+ "trace_len_entropy": [5.33, 12.04],
27
+ "trace_len_hist1": [0.0, 1.99],
28
+ "trace_len_hist2": [0.0, 0.42],
29
+ "trace_len_hist3": [0.0, 0.4],
30
+ "trace_len_hist4": [0.0, 0.19],
31
+ "trace_len_hist5": [0.0, 0.14],
32
+ "trace_len_hist6": [0.0, 10.0],
33
+ "trace_len_hist7": [0.0, 0.02],
34
+ "trace_len_hist8": [0.0, 0.04],
35
+ "trace_len_hist9": [0.0, 0.0],
36
+ "trace_len_hist10": [0.0, 2.7],
37
+ "trace_len_skewness_hist": [-0.58, 111.97],
38
+ "trace_len_kurtosis_hist": [-0.97, 14006.75],
39
+ "ratio_most_common_variant": [0.0, 0.79],
40
+ "ratio_top_1_variants": [0.0, 0.87],
41
+ "ratio_top_5_variants": [0.0, 0.98],
42
+ "ratio_top_10_variants": [0.0, 0.99],
43
+ "ratio_top_20_variants": [0.2, 1.0],
44
+ "ratio_top_50_variants": [0.5, 1.0],
45
+ "ratio_top_75_variants": [0.75, 1.0],
46
+ "mean_variant_occurrence": [1.0, 24500.67],
47
+ "std_variant_occurrence": [0.04, 42344.04],
48
+ "skewness_variant_occurrence": [1.54, 64.77],
49
+ "kurtosis_variant_occurrence": [0.66, 5083.46],
50
+ "n_unique_activities": [1.0, 1152.0],
51
+ "activities_min": [1.0, 66058.0],
52
+ "activities_max": [34.0, 466141.0],
53
+ "activities_mean": [4.13, 66058.0],
54
+ "activities_median": [2.0, 66058.0],
55
+ "activities_std": [0.0, 120522.25],
56
+ "activities_variance": [0.0, 14525612122.34],
57
+ "activities_q1": [1.0, 66058.0],
58
+ "activities_q3": [4.0, 79860.0],
59
+ "activities_iqr": [0.0, 77290.0],
60
+ "activities_skewness": [-0.06, 15.21],
61
+ "activities_kurtosis": [-1.5, 315.84],
62
+ "n_unique_start_activities": [1.0, 809.0],
63
+ "start_activities_min": [1.0, 150370.0],
64
+ "start_activities_max": [27.0, 199867.0],
65
+ "start_activities_mean": [3.7, 150370.0],
66
+ "start_activities_median": [1.0, 150370.0],
67
+ "start_activities_std": [0.0, 65387.49],
68
+ "start_activities_variance": [0.0, 4275524278.19],
69
+ "start_activities_q1": [1.0, 150370.0],
70
+ "start_activities_q3": [4.0, 150370.0],
71
+ "start_activities_iqr": [0.0, 23387.25],
72
+ "start_activities_skewness": [0.0, 9.3],
73
+ "start_activities_kurtosis": [-2.0, 101.82],
74
+ "n_unique_end_activities": [1.0, 757.0],
75
+ "end_activities_min": [1.0, 16653.0],
76
+ "end_activities_max": [28.0, 181328.0],
77
+ "end_activities_mean": [3.53, 24500.67],
78
+ "end_activities_median": [1.0, 16653.0],
79
+ "end_activities_std": [0.0, 42344.04],
80
+ "end_activities_variance": [0.0, 1793017566.89],
81
+ "end_activities_q1": [1.0, 16653.0],
82
+ "end_activities_q3": [3.0, 39876.0],
83
+ "end_activities_iqr": [0.0, 39766.0],
84
+ "end_activities_skewness": [-0.7, 13.82],
85
+ "end_activities_kurtosis": [-2.0, 255.39],
86
+ "eventropy_trace": [0.0, 13.36],
87
+ "eventropy_prefix": [0.0, 16.77],
88
+ "eventropy_global_block": [0.0, 24.71],
89
+ "eventropy_lempel_ziv": [0.0, 685.0],
90
+ "eventropy_k_block_diff_1": [-328.0, 962.0],
91
+ "eventropy_k_block_diff_3": [0.0, 871.0],
92
+ "eventropy_k_block_diff_5": [0.0, 881.0],
93
+ "eventropy_k_block_ratio_1": [0.0, 935.0],
94
+ "eventropy_k_block_ratio_3": [0.0, 7.11],
95
+ "eventropy_k_block_ratio_5": [0.0, 7.11],
96
+ "eventropy_knn_3": [0.0, 8.93],
97
+ "eventropy_knn_5": [0.0, 648.0],
98
+ "eventropy_knn_7": [0.0, 618.0],
99
+ "epa_variant_entropy": [0.0, 11563842.15],
100
+ "epa_normalized_variant_entropy": [0.0, 0.9],
101
+ "epa_sequence_entropy": [0.0, 21146257.12],
102
+ "epa_normalized_sequence_entropy": [0.0, 0.76],
103
+ "epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
104
+ "epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
105
+ "epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
106
+ "epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51]
107
+ }
108
+
109
+ return data_dict