Spaces:
Running
Running
Andrea Maldonado
committed on
Commit
·
96c9671
1
Parent(s):
f4bce6d
Moves parameters to param_keys
Browse files
- gedi/utils/io_helpers.py +6 -113
- gedi/utils/param_keys/features.py +106 -0
gedi/utils/io_helpers.py
CHANGED
@@ -6,6 +6,7 @@ import re
|
|
6 |
import shutil
|
7 |
import numpy as np
|
8 |
from collections import defaultdict
|
|
|
9 |
from pathlib import PurePath
|
10 |
from scipy.spatial.distance import euclidean
|
11 |
|
@@ -89,19 +90,19 @@ def normalize_value(value, min_val, max_val):
|
|
89 |
|
90 |
def compute_similarity(v1, v2):
|
91 |
feature_ranges = bpic_feature_values()
|
92 |
-
|
93 |
# Convert all values to float except for the "log" key
|
94 |
v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
|
95 |
v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
|
96 |
-
|
97 |
# Identify common numeric keys
|
98 |
common_keys = set(v1.keys()).intersection(set(v2.keys()), set(feature_ranges.keys()))
|
99 |
numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
|
100 |
-
|
101 |
if not numeric_keys:
|
102 |
print("[ERROR]: No common numeric keys found for similarity calculation.")
|
103 |
return None
|
104 |
-
|
105 |
# Normalize values and compute differences
|
106 |
differences = []
|
107 |
for key in numeric_keys:
|
@@ -109,115 +110,7 @@ def compute_similarity(v1, v2):
|
|
109 |
norm_v1 = normalize_value(v1[key], min_val, max_val)
|
110 |
norm_v2 = normalize_value(v2[key], min_val, max_val)
|
111 |
differences.append(abs(norm_v1 - norm_v2))
|
112 |
-
|
113 |
# Compute average difference as similarity metric
|
114 |
target_similarity = 1 - np.mean(differences)
|
115 |
return target_similarity
|
116 |
-
|
117 |
-
|
118 |
-
def bpic_feature_values():
|
119 |
-
|
120 |
-
data_dict = {
|
121 |
-
"n_traces": [226.0, 251734.0],
|
122 |
-
"n_variants": [6.0, 28457.0],
|
123 |
-
"ratio_variants_per_number_of_traces": [0.0, 1.0],
|
124 |
-
"trace_len_min": [1.0, 24.0],
|
125 |
-
"trace_len_max": [1.0, 2973.0],
|
126 |
-
"trace_len_mean": [1.0, 131.49],
|
127 |
-
"trace_len_median": [1.0, 55.0],
|
128 |
-
"trace_len_mode": [1.0, 61.0],
|
129 |
-
"trace_len_std": [0.0, 202.53],
|
130 |
-
"trace_len_variance": [0.0, 41017.89],
|
131 |
-
"trace_len_q1": [1.0, 44.0],
|
132 |
-
"trace_len_q3": [1.0, 169.0],
|
133 |
-
"trace_len_iqr": [0.0, 161.0],
|
134 |
-
"trace_len_geometric_mean": [1.0, 53.78],
|
135 |
-
"trace_len_geometric_std": [1.0, 5.65],
|
136 |
-
"trace_len_harmonic_mean": [1.0, 51.65],
|
137 |
-
"trace_len_skewness": [-0.58, 111.97],
|
138 |
-
"trace_len_kurtosis": [-0.97, 14006.75],
|
139 |
-
"trace_len_coefficient_variation": [0.0, 4.74],
|
140 |
-
"trace_len_entropy": [5.33, 12.04],
|
141 |
-
"trace_len_hist1": [0.0, 1.99],
|
142 |
-
"trace_len_hist2": [0.0, 0.42],
|
143 |
-
"trace_len_hist3": [0.0, 0.4],
|
144 |
-
"trace_len_hist4": [0.0, 0.19],
|
145 |
-
"trace_len_hist5": [0.0, 0.14],
|
146 |
-
"trace_len_hist6": [0.0, 10.0],
|
147 |
-
"trace_len_hist7": [0.0, 0.02],
|
148 |
-
"trace_len_hist8": [0.0, 0.04],
|
149 |
-
"trace_len_hist9": [0.0, 0.0],
|
150 |
-
"trace_len_hist10": [0.0, 2.7],
|
151 |
-
"trace_len_skewness_hist": [-0.58, 111.97],
|
152 |
-
"trace_len_kurtosis_hist": [-0.97, 14006.75],
|
153 |
-
"ratio_most_common_variant": [0.0, 0.79],
|
154 |
-
"ratio_top_1_variants": [0.0, 0.87],
|
155 |
-
"ratio_top_5_variants": [0.0, 0.98],
|
156 |
-
"ratio_top_10_variants": [0.0, 0.99],
|
157 |
-
"ratio_top_20_variants": [0.2, 1.0],
|
158 |
-
"ratio_top_50_variants": [0.5, 1.0],
|
159 |
-
"ratio_top_75_variants": [0.75, 1.0],
|
160 |
-
"mean_variant_occurrence": [1.0, 24500.67],
|
161 |
-
"std_variant_occurrence": [0.04, 42344.04],
|
162 |
-
"skewness_variant_occurrence": [1.54, 64.77],
|
163 |
-
"kurtosis_variant_occurrence": [0.66, 5083.46],
|
164 |
-
"n_unique_activities": [1.0, 1152.0],
|
165 |
-
"activities_min": [1.0, 66058.0],
|
166 |
-
"activities_max": [34.0, 466141.0],
|
167 |
-
"activities_mean": [4.13, 66058.0],
|
168 |
-
"activities_median": [2.0, 66058.0],
|
169 |
-
"activities_std": [0.0, 120522.25],
|
170 |
-
"activities_variance": [0.0, 14525612122.34],
|
171 |
-
"activities_q1": [1.0, 66058.0],
|
172 |
-
"activities_q3": [4.0, 79860.0],
|
173 |
-
"activities_iqr": [0.0, 77290.0],
|
174 |
-
"activities_skewness": [-0.06, 15.21],
|
175 |
-
"activities_kurtosis": [-1.5, 315.84],
|
176 |
-
"n_unique_start_activities": [1.0, 809.0],
|
177 |
-
"start_activities_min": [1.0, 150370.0],
|
178 |
-
"start_activities_max": [27.0, 199867.0],
|
179 |
-
"start_activities_mean": [3.7, 150370.0],
|
180 |
-
"start_activities_median": [1.0, 150370.0],
|
181 |
-
"start_activities_std": [0.0, 65387.49],
|
182 |
-
"start_activities_variance": [0.0, 4275524278.19],
|
183 |
-
"start_activities_q1": [1.0, 150370.0],
|
184 |
-
"start_activities_q3": [4.0, 150370.0],
|
185 |
-
"start_activities_iqr": [0.0, 23387.25],
|
186 |
-
"start_activities_skewness": [0.0, 9.3],
|
187 |
-
"start_activities_kurtosis": [-2.0, 101.82],
|
188 |
-
"n_unique_end_activities": [1.0, 757.0],
|
189 |
-
"end_activities_min": [1.0, 16653.0],
|
190 |
-
"end_activities_max": [28.0, 181328.0],
|
191 |
-
"end_activities_mean": [3.53, 24500.67],
|
192 |
-
"end_activities_median": [1.0, 16653.0],
|
193 |
-
"end_activities_std": [0.0, 42344.04],
|
194 |
-
"end_activities_variance": [0.0, 1793017566.89],
|
195 |
-
"end_activities_q1": [1.0, 16653.0],
|
196 |
-
"end_activities_q3": [3.0, 39876.0],
|
197 |
-
"end_activities_iqr": [0.0, 39766.0],
|
198 |
-
"end_activities_skewness": [-0.7, 13.82],
|
199 |
-
"end_activities_kurtosis": [-2.0, 255.39],
|
200 |
-
"eventropy_trace": [0.0, 13.36],
|
201 |
-
"eventropy_prefix": [0.0, 16.77],
|
202 |
-
"eventropy_global_block": [0.0, 24.71],
|
203 |
-
"eventropy_lempel_ziv": [0.0, 685.0],
|
204 |
-
"eventropy_k_block_diff_1": [-328.0, 962.0],
|
205 |
-
"eventropy_k_block_diff_3": [0.0, 871.0],
|
206 |
-
"eventropy_k_block_diff_5": [0.0, 881.0],
|
207 |
-
"eventropy_k_block_ratio_1": [0.0, 935.0],
|
208 |
-
"eventropy_k_block_ratio_3": [0.0, 7.11],
|
209 |
-
"eventropy_k_block_ratio_5": [0.0, 7.11],
|
210 |
-
"eventropy_knn_3": [0.0, 8.93],
|
211 |
-
"eventropy_knn_5": [0.0, 648.0],
|
212 |
-
"eventropy_knn_7": [0.0, 618.0],
|
213 |
-
"epa_variant_entropy": [0.0, 11563842.15],
|
214 |
-
"epa_normalized_variant_entropy": [0.0, 0.9],
|
215 |
-
"epa_sequence_entropy": [0.0, 21146257.12],
|
216 |
-
"epa_normalized_sequence_entropy": [0.0, 0.76],
|
217 |
-
"epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
|
218 |
-
"epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
|
219 |
-
"epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
|
220 |
-
"epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51]
|
221 |
-
}
|
222 |
-
|
223 |
-
return data_dict
|
|
|
6 |
import shutil
|
7 |
import numpy as np
|
8 |
from collections import defaultdict
|
9 |
+
from gedi.utils.param_keys.features import bpic_feature_values
|
10 |
from pathlib import PurePath
|
11 |
from scipy.spatial.distance import euclidean
|
12 |
|
|
|
90 |
|
91 |
def compute_similarity(v1, v2):
|
92 |
feature_ranges = bpic_feature_values()
|
93 |
+
|
94 |
# Convert all values to float except for the "log" key
|
95 |
v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
|
96 |
v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
|
97 |
+
|
98 |
# Identify common numeric keys
|
99 |
common_keys = set(v1.keys()).intersection(set(v2.keys()), set(feature_ranges.keys()))
|
100 |
numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
|
101 |
+
|
102 |
if not numeric_keys:
|
103 |
print("[ERROR]: No common numeric keys found for similarity calculation.")
|
104 |
return None
|
105 |
+
|
106 |
# Normalize values and compute differences
|
107 |
differences = []
|
108 |
for key in numeric_keys:
|
|
|
110 |
norm_v1 = normalize_value(v1[key], min_val, max_val)
|
111 |
norm_v2 = normalize_value(v2[key], min_val, max_val)
|
112 |
differences.append(abs(norm_v1 - norm_v2))
|
113 |
+
|
114 |
# Compute average difference as similarity metric
|
115 |
target_similarity = 1 - np.mean(differences)
|
116 |
return target_similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gedi/utils/param_keys/features.py
CHANGED
@@ -1,3 +1,109 @@
|
|
1 |
# Features params
|
2 |
FEATURE_PARAMS = 'feature_params'
|
3 |
FEATURE_SET = 'feature_set'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Features params
|
2 |
FEATURE_PARAMS = 'feature_params'
|
3 |
FEATURE_SET = 'feature_set'
|
4 |
+
|
5 |
+
def bpic_feature_values():
|
6 |
+
data_dict = {
|
7 |
+
"n_traces": [226.0, 251734.0],
|
8 |
+
"n_variants": [6.0, 28457.0],
|
9 |
+
"ratio_variants_per_number_of_traces": [0.0, 1.0],
|
10 |
+
"trace_len_min": [1.0, 24.0],
|
11 |
+
"trace_len_max": [1.0, 2973.0],
|
12 |
+
"trace_len_mean": [1.0, 131.49],
|
13 |
+
"trace_len_median": [1.0, 55.0],
|
14 |
+
"trace_len_mode": [1.0, 61.0],
|
15 |
+
"trace_len_std": [0.0, 202.53],
|
16 |
+
"trace_len_variance": [0.0, 41017.89],
|
17 |
+
"trace_len_q1": [1.0, 44.0],
|
18 |
+
"trace_len_q3": [1.0, 169.0],
|
19 |
+
"trace_len_iqr": [0.0, 161.0],
|
20 |
+
"trace_len_geometric_mean": [1.0, 53.78],
|
21 |
+
"trace_len_geometric_std": [1.0, 5.65],
|
22 |
+
"trace_len_harmonic_mean": [1.0, 51.65],
|
23 |
+
"trace_len_skewness": [-0.58, 111.97],
|
24 |
+
"trace_len_kurtosis": [-0.97, 14006.75],
|
25 |
+
"trace_len_coefficient_variation": [0.0, 4.74],
|
26 |
+
"trace_len_entropy": [5.33, 12.04],
|
27 |
+
"trace_len_hist1": [0.0, 1.99],
|
28 |
+
"trace_len_hist2": [0.0, 0.42],
|
29 |
+
"trace_len_hist3": [0.0, 0.4],
|
30 |
+
"trace_len_hist4": [0.0, 0.19],
|
31 |
+
"trace_len_hist5": [0.0, 0.14],
|
32 |
+
"trace_len_hist6": [0.0, 10.0],
|
33 |
+
"trace_len_hist7": [0.0, 0.02],
|
34 |
+
"trace_len_hist8": [0.0, 0.04],
|
35 |
+
"trace_len_hist9": [0.0, 0.0],
|
36 |
+
"trace_len_hist10": [0.0, 2.7],
|
37 |
+
"trace_len_skewness_hist": [-0.58, 111.97],
|
38 |
+
"trace_len_kurtosis_hist": [-0.97, 14006.75],
|
39 |
+
"ratio_most_common_variant": [0.0, 0.79],
|
40 |
+
"ratio_top_1_variants": [0.0, 0.87],
|
41 |
+
"ratio_top_5_variants": [0.0, 0.98],
|
42 |
+
"ratio_top_10_variants": [0.0, 0.99],
|
43 |
+
"ratio_top_20_variants": [0.2, 1.0],
|
44 |
+
"ratio_top_50_variants": [0.5, 1.0],
|
45 |
+
"ratio_top_75_variants": [0.75, 1.0],
|
46 |
+
"mean_variant_occurrence": [1.0, 24500.67],
|
47 |
+
"std_variant_occurrence": [0.04, 42344.04],
|
48 |
+
"skewness_variant_occurrence": [1.54, 64.77],
|
49 |
+
"kurtosis_variant_occurrence": [0.66, 5083.46],
|
50 |
+
"n_unique_activities": [1.0, 1152.0],
|
51 |
+
"activities_min": [1.0, 66058.0],
|
52 |
+
"activities_max": [34.0, 466141.0],
|
53 |
+
"activities_mean": [4.13, 66058.0],
|
54 |
+
"activities_median": [2.0, 66058.0],
|
55 |
+
"activities_std": [0.0, 120522.25],
|
56 |
+
"activities_variance": [0.0, 14525612122.34],
|
57 |
+
"activities_q1": [1.0, 66058.0],
|
58 |
+
"activities_q3": [4.0, 79860.0],
|
59 |
+
"activities_iqr": [0.0, 77290.0],
|
60 |
+
"activities_skewness": [-0.06, 15.21],
|
61 |
+
"activities_kurtosis": [-1.5, 315.84],
|
62 |
+
"n_unique_start_activities": [1.0, 809.0],
|
63 |
+
"start_activities_min": [1.0, 150370.0],
|
64 |
+
"start_activities_max": [27.0, 199867.0],
|
65 |
+
"start_activities_mean": [3.7, 150370.0],
|
66 |
+
"start_activities_median": [1.0, 150370.0],
|
67 |
+
"start_activities_std": [0.0, 65387.49],
|
68 |
+
"start_activities_variance": [0.0, 4275524278.19],
|
69 |
+
"start_activities_q1": [1.0, 150370.0],
|
70 |
+
"start_activities_q3": [4.0, 150370.0],
|
71 |
+
"start_activities_iqr": [0.0, 23387.25],
|
72 |
+
"start_activities_skewness": [0.0, 9.3],
|
73 |
+
"start_activities_kurtosis": [-2.0, 101.82],
|
74 |
+
"n_unique_end_activities": [1.0, 757.0],
|
75 |
+
"end_activities_min": [1.0, 16653.0],
|
76 |
+
"end_activities_max": [28.0, 181328.0],
|
77 |
+
"end_activities_mean": [3.53, 24500.67],
|
78 |
+
"end_activities_median": [1.0, 16653.0],
|
79 |
+
"end_activities_std": [0.0, 42344.04],
|
80 |
+
"end_activities_variance": [0.0, 1793017566.89],
|
81 |
+
"end_activities_q1": [1.0, 16653.0],
|
82 |
+
"end_activities_q3": [3.0, 39876.0],
|
83 |
+
"end_activities_iqr": [0.0, 39766.0],
|
84 |
+
"end_activities_skewness": [-0.7, 13.82],
|
85 |
+
"end_activities_kurtosis": [-2.0, 255.39],
|
86 |
+
"eventropy_trace": [0.0, 13.36],
|
87 |
+
"eventropy_prefix": [0.0, 16.77],
|
88 |
+
"eventropy_global_block": [0.0, 24.71],
|
89 |
+
"eventropy_lempel_ziv": [0.0, 685.0],
|
90 |
+
"eventropy_k_block_diff_1": [-328.0, 962.0],
|
91 |
+
"eventropy_k_block_diff_3": [0.0, 871.0],
|
92 |
+
"eventropy_k_block_diff_5": [0.0, 881.0],
|
93 |
+
"eventropy_k_block_ratio_1": [0.0, 935.0],
|
94 |
+
"eventropy_k_block_ratio_3": [0.0, 7.11],
|
95 |
+
"eventropy_k_block_ratio_5": [0.0, 7.11],
|
96 |
+
"eventropy_knn_3": [0.0, 8.93],
|
97 |
+
"eventropy_knn_5": [0.0, 648.0],
|
98 |
+
"eventropy_knn_7": [0.0, 618.0],
|
99 |
+
"epa_variant_entropy": [0.0, 11563842.15],
|
100 |
+
"epa_normalized_variant_entropy": [0.0, 0.9],
|
101 |
+
"epa_sequence_entropy": [0.0, 21146257.12],
|
102 |
+
"epa_normalized_sequence_entropy": [0.0, 0.76],
|
103 |
+
"epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
|
104 |
+
"epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
|
105 |
+
"epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
|
106 |
+
"epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51]
|
107 |
+
}
|
108 |
+
|
109 |
+
return data_dict
|