Aryasomayajula, Sai Anirudh [External] committed on
Commit
5282b6d
·
1 Parent(s): b10349b

Enhances similarity computation by normalizing feature values and adding feature range definitions

Browse files
Files changed (1) hide show
  1. gedi/utils/io_helpers.py +132 -19
gedi/utils/io_helpers.py CHANGED
@@ -84,27 +84,140 @@ def dump_features_json(features: dict, output_path, content_type="features"):
84
  json.dump(features, fp, default=int)
85
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
86
 
87
- def compute_similarity(v1, v2):
 
88
 
89
- # Convert all values to float except for the value for the key "Log"
 
 
 
90
  v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
91
  v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
92
-
93
- # Filter out non-numeric values and ensure the same keys exist in both dictionaries
94
- common_keys = set(v1.keys()).intersection(set(v2.keys()))
95
  numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
96
-
97
- # Create vectors from the filtered keys
98
- vec1 = np.array([v1[k] for k in numeric_keys])
99
- vec2 = np.array([v2[k] for k in numeric_keys])
100
-
101
- if len(vec1) == 0 or len(vec2) == 0:
102
- print("[ERROR]: No common numeric keys found for (Edit) Distance calculation.")
103
  return None
104
-
105
- else:
106
- # Calculate Euclidean Similarity
107
- target_similarity = 1 / (1 + euclidean(vec1, vec2))
108
- # print("VECTORS: ", vec1, vec2, target_similarity)
109
-
110
- return target_similarity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  json.dump(features, fp, default=int)
85
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
86
 
87
def normalize_value(value, min_val, max_val):
    """Scale *value* into [0, 1] relative to the range [min_val, max_val].

    A degenerate range (max_val == min_val) yields 0.0 so callers never
    divide by zero. Values outside the range map outside [0, 1].
    """
    span = max_val - min_val
    if span == 0:
        return 0.0
    return (value - min_val) / span
89
 
90
def compute_similarity(v1, v2):
    """Compute a normalized similarity score between two feature dictionaries.

    Each feature value is min-max normalized against the global BPIC ranges
    from `bpic_feature_values()`, and the similarity is 1 minus the mean
    absolute difference of the normalized values over the features present
    in both dictionaries.

    Parameters
    ----------
    v1, v2 : dict
        Feature dictionaries. The special key "log" is carried through
        unconverted; every other value is coerced to float. Values that
        cannot be coerced are skipped rather than raising — the original
        code's "filter non-numeric" intent, which an unguarded float()
        would defeat by raising first.

    Returns
    -------
    float or None
        The similarity score, or None (with an error message) when the
        dictionaries share no numeric keys with known feature ranges.
    """
    feature_ranges = bpic_feature_values()

    def _coerce(d):
        # Coerce values to float, keeping "log" untouched and silently
        # dropping values float() cannot parse.
        out = {}
        for k, val in d.items():
            if k == "log":
                out[k] = val
                continue
            try:
                out[k] = float(val)
            except (TypeError, ValueError):
                continue
        return out

    v1 = _coerce(v1)
    v2 = _coerce(v2)

    # Only keys present in both inputs AND with a known normalization range.
    # Sorted so the float summation order (and thus the result) is deterministic.
    common_keys = sorted(set(v1) & set(v2) & set(feature_ranges))
    numeric_keys = [k for k in common_keys
                    if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]

    if not numeric_keys:
        print("[ERROR]: No common numeric keys found for similarity calculation.")
        return None

    # Normalize both values per feature and accumulate absolute differences.
    differences = []
    for key in numeric_keys:
        min_val, max_val = feature_ranges[key]
        norm_v1 = normalize_value(v1[key], min_val, max_val)
        norm_v2 = normalize_value(v2[key], min_val, max_val)
        differences.append(abs(norm_v1 - norm_v2))

    # Average difference inverted into a similarity (1.0 == identical).
    target_similarity = 1 - np.mean(differences)
    return target_similarity
116
+
117
+
118
def bpic_feature_values():
    """Return the global [min, max] value range for each BPIC log feature.

    The ranges are hard-coded constants (presumably observed over the BPIC
    event-log collection — TODO confirm provenance) and are used by
    `compute_similarity` to min-max normalize feature values before
    comparison.

    Returns:
        dict[str, list[float]]: feature name -> [min_value, max_value].
    """
    # Feature name -> [min, max] observed range.
    data_dict = {
        # Trace/variant counts and trace-length statistics.
        "n_traces": [226.0, 251734.0],
        "n_variants": [6.0, 28457.0],
        "ratio_variants_per_number_of_traces": [0.0, 1.0],
        "trace_len_min": [1.0, 24.0],
        "trace_len_max": [1.0, 2973.0],
        "trace_len_mean": [1.0, 131.49],
        "trace_len_median": [1.0, 55.0],
        "trace_len_mode": [1.0, 61.0],
        "trace_len_std": [0.0, 202.53],
        "trace_len_variance": [0.0, 41017.89],
        "trace_len_q1": [1.0, 44.0],
        "trace_len_q3": [1.0, 169.0],
        "trace_len_iqr": [0.0, 161.0],
        "trace_len_geometric_mean": [1.0, 53.78],
        "trace_len_geometric_std": [1.0, 5.65],
        "trace_len_harmonic_mean": [1.0, 51.65],
        "trace_len_skewness": [-0.58, 111.97],
        "trace_len_kurtosis": [-0.97, 14006.75],
        "trace_len_coefficient_variation": [0.0, 4.74],
        "trace_len_entropy": [5.33, 12.04],
        # Trace-length histogram bins and their moments.
        "trace_len_hist1": [0.0, 1.99],
        "trace_len_hist2": [0.0, 0.42],
        "trace_len_hist3": [0.0, 0.4],
        "trace_len_hist4": [0.0, 0.19],
        "trace_len_hist5": [0.0, 0.14],
        "trace_len_hist6": [0.0, 10.0],
        "trace_len_hist7": [0.0, 0.02],
        "trace_len_hist8": [0.0, 0.04],
        "trace_len_hist9": [0.0, 0.0],
        "trace_len_hist10": [0.0, 2.7],
        "trace_len_skewness_hist": [-0.58, 111.97],
        "trace_len_kurtosis_hist": [-0.97, 14006.75],
        # Variant-distribution statistics.
        "ratio_most_common_variant": [0.0, 0.79],
        "ratio_top_1_variants": [0.0, 0.87],
        "ratio_top_5_variants": [0.0, 0.98],
        "ratio_top_10_variants": [0.0, 0.99],
        "ratio_top_20_variants": [0.2, 1.0],
        "ratio_top_50_variants": [0.5, 1.0],
        "ratio_top_75_variants": [0.75, 1.0],
        "mean_variant_occurrence": [1.0, 24500.67],
        "std_variant_occurrence": [0.04, 42344.04],
        "skewness_variant_occurrence": [1.54, 64.77],
        "kurtosis_variant_occurrence": [0.66, 5083.46],
        # Activity-frequency statistics.
        "n_unique_activities": [1.0, 1152.0],
        "activities_min": [1.0, 66058.0],
        "activities_max": [34.0, 466141.0],
        "activities_mean": [4.13, 66058.0],
        "activities_median": [2.0, 66058.0],
        "activities_std": [0.0, 120522.25],
        "activities_variance": [0.0, 14525612122.34],
        "activities_q1": [1.0, 66058.0],
        "activities_q3": [4.0, 79860.0],
        "activities_iqr": [0.0, 77290.0],
        "activities_skewness": [-0.06, 15.21],
        "activities_kurtosis": [-1.5, 315.84],
        # Start-activity statistics.
        "n_unique_start_activities": [1.0, 809.0],
        "start_activities_min": [1.0, 150370.0],
        "start_activities_max": [27.0, 199867.0],
        "start_activities_mean": [3.7, 150370.0],
        "start_activities_median": [1.0, 150370.0],
        "start_activities_std": [0.0, 65387.49],
        "start_activities_variance": [0.0, 4275524278.19],
        "start_activities_q1": [1.0, 150370.0],
        "start_activities_q3": [4.0, 150370.0],
        "start_activities_iqr": [0.0, 23387.25],
        "start_activities_skewness": [0.0, 9.3],
        "start_activities_kurtosis": [-2.0, 101.82],
        # End-activity statistics.
        "n_unique_end_activities": [1.0, 757.0],
        "end_activities_min": [1.0, 16653.0],
        "end_activities_max": [28.0, 181328.0],
        "end_activities_mean": [3.53, 24500.67],
        "end_activities_median": [1.0, 16653.0],
        "end_activities_std": [0.0, 42344.04],
        "end_activities_variance": [0.0, 1793017566.89],
        "end_activities_q1": [1.0, 16653.0],
        "end_activities_q3": [3.0, 39876.0],
        "end_activities_iqr": [0.0, 39766.0],
        "end_activities_skewness": [-0.7, 13.82],
        "end_activities_kurtosis": [-2.0, 255.39],
        # Event-log entropy measures.
        "eventropy_trace": [0.0, 13.36],
        "eventropy_prefix": [0.0, 16.77],
        "eventropy_global_block": [0.0, 24.71],
        "eventropy_lempel_ziv": [0.0, 685.0],
        "eventropy_k_block_diff_1": [-328.0, 962.0],
        "eventropy_k_block_diff_3": [0.0, 871.0],
        "eventropy_k_block_diff_5": [0.0, 881.0],
        "eventropy_k_block_ratio_1": [0.0, 935.0],
        "eventropy_k_block_ratio_3": [0.0, 7.11],
        "eventropy_k_block_ratio_5": [0.0, 7.11],
        "eventropy_knn_3": [0.0, 8.93],
        "eventropy_knn_5": [0.0, 648.0],
        "eventropy_knn_7": [0.0, 618.0],
        # Event-process-analysis (EPA) entropy measures.
        "epa_variant_entropy": [0.0, 11563842.15],
        "epa_normalized_variant_entropy": [0.0, 0.9],
        "epa_sequence_entropy": [0.0, 21146257.12],
        "epa_normalized_sequence_entropy": [0.0, 0.76],
        "epa_sequence_entropy_linear_forgetting": [0.0, 14140225.9],
        "epa_normalized_sequence_entropy_linear_forgetting": [0.0, 0.42],
        "epa_sequence_entropy_exponential_forgetting": [0.0, 15576076.83],
        "epa_normalized_sequence_entropy_exponential_forgetting": [0.0, 0.51]
    }

    return data_dict