Spaces:
Running
Running
Aryasomayajula, Sai Anirudh [External]
committed on
Commit
·
5282b6d
1
Parent(s):
b10349b
Enhances similarity computation by normalizing feature values and adding feature range definitions
Browse files- gedi/utils/io_helpers.py +132 -19
gedi/utils/io_helpers.py
CHANGED
@@ -84,27 +84,140 @@ def dump_features_json(features: dict, output_path, content_type="features"):
|
|
84 |
json.dump(features, fp, default=int)
|
85 |
print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
|
86 |
|
87 |
-
def
|
|
|
88 |
|
89 |
-
|
|
|
|
|
|
|
90 |
v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
|
91 |
v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
|
92 |
-
|
93 |
-
#
|
94 |
-
common_keys = set(v1.keys()).intersection(set(v2.keys()))
|
95 |
numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
vec2 = np.array([v2[k] for k in numeric_keys])
|
100 |
-
|
101 |
-
if len(vec1) == 0 or len(vec2) == 0:
|
102 |
-
print("[ERROR]: No common numeric keys found for (Edit) Distance calculation.")
|
103 |
return None
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
json.dump(features, fp, default=int)
|
85 |
print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
|
86 |
|
87 |
+
def normalize_value(value, min_val, max_val):
    """Min-max scale ``value`` relative to the range ``[min_val, max_val]``.

    Args:
        value: Number to scale.
        min_val: Lower bound of the reference range (maps to 0.0).
        max_val: Upper bound of the reference range (maps to 1.0).

    Returns:
        ``(value - min_val) / (max_val - min_val)``, or ``0.0`` when the
        range is degenerate (``min_val == max_val``).

    Note:
        The result is not clamped: a ``value`` outside the reference range
        yields a result below 0.0 or above 1.0.
    """
    # A zero-width range would divide by zero; treat every value as 0.0.
    if max_val == min_val:
        return 0.0
    return (value - min_val) / (max_val - min_val)
|
89 |
|
90 |
+
def compute_similarity(v1, v2):
    """Return a similarity score between two feature dictionaries.

    Every feature that is present in ``v1``, in ``v2`` and in the range
    table returned by ``bpic_feature_values()`` is min-max normalized with
    that table's ``[min, max]`` pair. The score is
    ``1 - mean(|norm_v1 - norm_v2|)`` over those features.

    Args:
        v1: Mapping of feature name to value. The ``"log"`` entry, if any,
            is ignored.
        v2: Mapping of feature name to value, compared against ``v1``.

    Returns:
        The similarity score, or ``None`` (after printing an error message)
        when the two inputs share no comparable numeric feature.

    Note:
        Normalized values are not clamped, so inputs outside the reference
        ranges can push the score below 0.
    """
    feature_ranges = bpic_feature_values()

    def _as_finite_float(raw):
        # Values that cannot be converted, or that are NaN/inf, are treated
        # as "not comparable" instead of raising or poisoning the mean.
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return None
        return number if np.isfinite(number) else None

    # Only keys known to all three mappings can be normalized and compared.
    # Sorting keeps the floating-point accumulation order independent of
    # set iteration order (which varies with string hash randomization).
    common_keys = sorted(set(v1) & set(v2) & set(feature_ranges))

    differences = []
    for key in common_keys:
        if key == "log":  # the log name is an identifier, not a feature
            continue
        first = _as_finite_float(v1[key])
        second = _as_finite_float(v2[key])
        if first is None or second is None:
            continue
        min_val, max_val = feature_ranges[key]
        norm_v1 = normalize_value(first, min_val, max_val)
        norm_v2 = normalize_value(second, min_val, max_val)
        differences.append(abs(norm_v1 - norm_v2))

    if not differences:
        print("[ERROR]: No common numeric keys found for similarity calculation.")
        return None

    # Average normalized distance, turned into a similarity.
    target_similarity = 1 - np.mean(differences)
    return target_similarity
|
116 |
+
|
117 |
+
|
118 |
+
def bpic_feature_values():
    """Return the reference ``[min, max]`` range for each known feature.

    Returns:
        A newly built dict mapping feature name to a two-element list
        ``[min_value, max_value]``. A fresh dict is created on every call,
        so callers may mutate the result without affecting later calls.
    """
    data_dict = {
        "n_traces": [226.0, 251734.0],
        "n_variants": [6.0, 28457.0],
        "ratio_variants_per_number_of_traces": [0.0, 1.0],
        "trace_len_min": [1.0, 24.0],
        "trace_len_max": [1.0, 2973.0],
        "trace_len_mean": [1.0, 131.49],
        "trace_len_median": [1.0, 55.0],
        "trace_len_mode": [1.0, 61.0],
        "trace_len_std": [0.0, 202.53],
        "trace_len_variance": [0.0, 41017.89],
        "trace_len_q1": [1.0, 44.0],
        "trace_len_q3": [1.0, 169.0],
        "trace_len_iqr": [0.0, 161.0],
        "trace_len_geometric_mean": [1.0, 53.78],
        "trace_len_geometric_std": [1.0, 5.65],
        "trace_len_harmonic_mean": [1.0, 51.65],
        "trace_len_skewness": [-0.58, 111.97],
        "trace_len_kurtosis": [-0.97, 14006.75],
        "trace_len_coefficient_variation": [0.0, 4.74],
        "trace_len_entropy": [5.33, 12.04],
        "trace_len_hist1": [0.0, 1.99],
        "trace_len_hist2": [0.0, 0.42],
        "trace_len_hist3": [0.0, 0.4],
        "trace_len_hist4": [0.0, 0.19],
        "trace_len_hist5": [0.0, 0.14],
        "trace_len_hist6": [0.0, 10.0],
        "trace_len_hist7": [0.0, 0.02],
        "trace_len_hist8": [0.0, 0.04],
        # Zero-width range: normalization of this feature has no spread.
        "trace_len_hist9": [0.0, 0.0],
        "trace_len_hist10": [0.0, 2.7],
        "trace_len_skewness_hist": [-0.58, 111.97],
        "trace_len_kurtosis_hist": [-0.97, 14006.75],
        "ratio_most_common_variant": [0.0, 0.79],
        "ratio_top_1_variants": [0.0, 0.87],
        "ratio_top_5_variants": [0.0, 0.98],
        "ratio_top_10_variants": [0.0, 0.99],
        "ratio_top_20_variants": [0.2, 1.0],
        "ratio_top_50_variants": [0.5, 1.0],
        "ratio_top_75_variants": [0.75, 1.0],
        "mean_variant_occurrence": [1.0, 24500.67],
        "std_variant_occurrence": [0.04, 42344.04],
        "skewness_variant_occurrence": [1.54, 64.77],
        "kurtosis_variant_occurrence": [0.66, 5083.46],
        "n_unique_activities": [1.0, 1152.0],
        "activities_min": [1.0, 66058.0],
        "activities_max": [34.0, 466141.0],
        "activities_mean": [4.13, 66058.0],
        "activities_median": [2.0, 66058.0],
        "activities_std": [0.0, 120522.25],
        "activities_variance": [0.0, 14525612122.34],
        "activities_q1": [1.0, 66058.0],
        "activities_q3": [4.0, 79860.0],
        "activities_iqr": [0.0, 77290.0],
        "activities_skewness": [-0.06, 15.21],
        "activities_kurtosis": [-1.5, 315.84],
        "n_unique_start_activities": [1.0, 809.0],
        "start_activities_min": [1.0, 150370.0],
        "start_activities_max": [27.0, 199867.0],
        "start_activities_mean": [3.7, 150370.0],
        "start_activities_median": [1.0, 150370.0],
        "start_activities_std": [0.0, 65387.49],
        "start_activities_variance": [0.0, 4275524278.19],
        "start_activities_q1": [1.0, 150370.0],
        "start_activities_q3": [4.0, 150370.0],
        "start_activities_iqr": [0.0, 23387.25],
        "start_activities_skewness": [0.0, 9.3],
        "start_activities_kurtosis": [-2.0, 101.82],
        "n_unique_end_activities": [1.0, 757.0],
        "end_activities_min": [1.0, 16653.0],
        "end_activities_max": [28.0, 181328.0],
        "end_activities_mean": [3.53, 24500.67],
        "end_activities_median": [1.0, 16653.0],
        "end_activities_std": [0.0, 42344.04],
        "end_activities_variance": [0.0, 1793017566.89],
        "end_activities_q1": [1.0, 16653.0],
        "end_activities_q3": [3.0, 39876.0],
        "end_activities_iqr": [0.0, 39766.0],
        "end_activities_skewness": [-0.7, 13.82],
        "end_activities_kurtosis": [-2.0, 255.39],
        "eventropy_trace": [0.0, 13.36],
        "eventropy_prefix": [0.0, 16.77],
        "eventropy_global_block": [0.0, 24.71],
        "eventropy_lempel_ziv": [0.0, 685.0],
        "eventropy_k_block_diff_1": [-328.0, 962.0],
        "eventropy_k_block_diff_3": [0.0, 871.0],
        "eventropy_k_block_diff_5": [0.0, 881.0],
        "eventropy_k_block_ratio_1": [0.0, 935.0],
        "eventropy_k_block_ratio_3": [0.0, 7.11],
        "eventropy_k_block_ratio_5": [0.0, 7.11],
        "eventropy_knn_3": [0.0, 8.93],
        "eventropy_knn_5": [0.0, 648.0],
        "eventropy_knn_7": [0.0, 618.0],
        "epa_variant_entropy": [0.0, 11563842.15],
        "epa_normalized_variant_entropy": [0.0, 0.9],
        "epa_sequence_entropy": [0.0, 21146257.12],
        "epa_normalized_sequence_entropy": [0.0, 0.76],
        "epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
        "epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
        "epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
        "epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51],
    }

    return data_dict
|