Andrea Maldonado committed on
Commit
78d7948
·
1 Parent(s): 9b8cb42

Computes Euclidean similarity

Browse files
data/validation/genELexperiment1_04_02.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02"}
 
1
+ {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.6520972056586477}
data/validation/genELexperiment2_07_04.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04"}
 
1
+ {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.3520969938410784}
data/validation/genELexperiment3_04_nan.json CHANGED
@@ -1 +1 @@
1
- {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan"}
 
1
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.6520972056586477}
data/validation/genELexperiment4_nan_02.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.2, "log": "genELexperiment4_nan_02"}
 
1
+ {"ratio_top_20_variants": 0.2, "log": "genELexperiment4_nan_02", "target_similarity": 1.0}
gedi/generator.py CHANGED
@@ -19,7 +19,7 @@ from pm4py.sim import play_out
19
  from smac import HyperparameterOptimizationFacade, Scenario
20
  from utils.param_keys import OUTPUT_PATH, INPUT_PATH
21
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
22
- from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, calculate_manhattan_distance
23
  from gedi.utils.io_helpers import read_csvs
24
  import xml.etree.ElementTree as ET
25
  import re
@@ -222,7 +222,8 @@ class GenerateEventLogs():
222
  features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
223
  features_to_dump['log']= os.path.split(save_path)[1].split(".")[0]
224
  # calculating the manhattan distance of the generated log to the target features
225
- features_to_dump['distance_to_target'] = calculate_manhattan_distance(self.objectives, features_to_dump)
 
226
  dump_features_json(features_to_dump, save_path)
227
 
228
  return log_config
 
19
  from smac import HyperparameterOptimizationFacade, Scenario
20
  from utils.param_keys import OUTPUT_PATH, INPUT_PATH
21
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
22
+ from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, compute_similarity
23
  from gedi.utils.io_helpers import read_csvs
24
  import xml.etree.ElementTree as ET
25
  import re
 
222
  features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
223
  features_to_dump['log']= os.path.split(save_path)[1].split(".")[0]
224
  # calculating the manhattan distance of the generated log to the target features
225
+ #features_to_dump['distance_to_target'] = calculate_manhattan_distance(self.objectives, features_to_dump)
226
+ features_to_dump['target_similarity'] = compute_similarity(self.objectives, features_to_dump)
227
  dump_features_json(features_to_dump, save_path)
228
 
229
  return log_config
gedi/utils/io_helpers.py CHANGED
@@ -7,6 +7,7 @@ import shutil
7
  import numpy as np
8
  from collections import defaultdict
9
  from pathlib import Path, PurePath
 
10
 
11
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
12
  os.makedirs(destination, exist_ok=True)
@@ -83,12 +84,12 @@ def dump_features_json(features: dict, output_path, content_type="features"):
83
  json.dump(features, fp, default=int)
84
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
85
 
86
- def calculate_manhattan_distance(v1, v2):
87
 
88
  # HOTFIX: Rename 'ratio_unique_traces_per_trace
89
  if 'ratio_unique_traces_per_trace' in v1:
90
  v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
91
-
92
  # Filter out non-numeric values and ensure the same keys exist in both dictionaries
93
  common_keys = set(v1.keys()).intersection(set(v2.keys()))
94
  numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
@@ -98,11 +99,12 @@ def calculate_manhattan_distance(v1, v2):
98
  vec2 = np.array([v2[k] for k in numeric_keys])
99
 
100
  if len(vec1) == 0 or len(vec2) == 0:
101
- print("[ERROR]: No common numeric keys found for (Manhattan) Distance calculation.")
102
  return None
103
 
104
  else:
105
- # Calculate Manhattan Distance
106
- manhattan_distance = np.sum(np.abs(vec1 - vec2))
 
107
 
108
- return manhattan_distance
 
7
  import numpy as np
8
  from collections import defaultdict
9
  from pathlib import Path, PurePath
10
+ from scipy.spatial.distance import euclidean
11
 
12
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
13
  os.makedirs(destination, exist_ok=True)
 
84
  json.dump(features, fp, default=int)
85
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
86
 
87
+ def compute_similarity(v1, v2):
88
 
89
  # HOTFIX: Rename 'ratio_unique_traces_per_trace
90
  if 'ratio_unique_traces_per_trace' in v1:
91
  v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
92
+
93
  # Filter out non-numeric values and ensure the same keys exist in both dictionaries
94
  common_keys = set(v1.keys()).intersection(set(v2.keys()))
95
  numeric_keys = [k for k in common_keys if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))]
 
99
  vec2 = np.array([v2[k] for k in numeric_keys])
100
 
101
  if len(vec1) == 0 or len(vec2) == 0:
102
+ print("[ERROR]: No common numeric keys found for (Edit) Distance calculation.")
103
  return None
104
 
105
  else:
106
+ # Calculate Euclidean Similarity
107
+ target_similarity = 1-euclidean(vec1, vec2)
108
+ #print("VECTORS: ", vec1, vec2, target_similarity)
109
 
110
+ return target_similarity