Andrea MH committed on
Commit
f744036
·
unverified ·
2 Parent(s): 885bc68 8735c43

Merge pull request #18 from andreamalhera/17-add-feature-similarity-to-json-output

Browse files
data/validation/genELexperiment1_04_02.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02"}
 
1
+ {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.6520971605578558}
data/validation/genELexperiment2_07_04.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04"}
 
1
+ {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.35199750692556764}
data/validation/genELexperiment3_04_nan.json CHANGED
@@ -1 +1 @@
1
- {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan"}
 
1
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.6520972056586477}
data/validation/genELexperiment4_nan_02.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.2, "log": "genELexperiment4_nan_02"}
 
1
+ {"ratio_top_20_variants": 0.2, "log": "genELexperiment4_nan_02", "target_similarity": 1.0}
gedi/generator.py CHANGED
@@ -19,7 +19,7 @@ from pm4py.sim import play_out
19
  from smac import HyperparameterOptimizationFacade, Scenario
20
  from utils.param_keys import OUTPUT_PATH, INPUT_PATH
21
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
22
- from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json
23
  from gedi.utils.io_helpers import read_csvs
24
  import xml.etree.ElementTree as ET
25
  import re
@@ -221,6 +221,9 @@ class GenerateEventLogs():
221
  if features_to_dump.get('ratio_unique_traces_per_trace'):#HOTFIX
222
  features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
223
  features_to_dump['log']= os.path.split(save_path)[1].split(".")[0]
 
 
 
224
  dump_features_json(features_to_dump, save_path)
225
 
226
  return log_config
 
19
  from smac import HyperparameterOptimizationFacade, Scenario
20
  from utils.param_keys import OUTPUT_PATH, INPUT_PATH
21
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
22
+ from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, compute_similarity
23
  from gedi.utils.io_helpers import read_csvs
24
  import xml.etree.ElementTree as ET
25
  import re
 
221
  if features_to_dump.get('ratio_unique_traces_per_trace'):#HOTFIX
222
  features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
223
  features_to_dump['log']= os.path.split(save_path)[1].split(".")[0]
224
+ # calculating the manhattan distance of the generated log to the target features
225
+ #features_to_dump['distance_to_target'] = calculate_manhattan_distance(self.objectives, features_to_dump)
226
+ features_to_dump['target_similarity'] = compute_similarity(self.objectives, features_to_dump)
227
  dump_features_json(features_to_dump, save_path)
228
 
229
  return log_config
gedi/utils/io_helpers.py CHANGED
@@ -4,9 +4,10 @@ import os
4
  import pandas as pd
5
  import re
6
  import shutil
7
-
8
  from collections import defaultdict
9
  from pathlib import Path, PurePath
 
10
 
11
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
12
  os.makedirs(destination, exist_ok=True)
@@ -82,3 +83,28 @@ def dump_features_json(features: dict, output_path, content_type="features"):
82
  with open(json_path, 'w') as fp:
83
  json.dump(features, fp, default=int)
84
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import re
6
  import shutil
7
+ import numpy as np
8
  from collections import defaultdict
9
  from pathlib import Path, PurePath
10
+ from scipy.spatial.distance import euclidean
11
 
12
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
13
  os.makedirs(destination, exist_ok=True)
 
83
  with open(json_path, 'w') as fp:
84
  json.dump(features, fp, default=int)
85
  print(f"SUCCESS: Saved {len(features)-1} {content_type} in {json_path}")#-1 because 'log' is not a feature
86
+
87
def compute_similarity(v1, v2):
    """Return ``1 - euclidean_distance`` between two feature dictionaries.

    Only keys that occur in both dictionaries and whose values are ``int``
    or ``float`` on both sides take part in the comparison; all other
    entries (for example a string-valued ``'log'`` name) are ignored.

    Args:
        v1: Feature dictionary. A legacy ``'ratio_unique_traces_per_trace'``
            key is treated as ``'ratio_variants_per_number_of_traces'``.
            The dictionary passed in is not modified.
        v2: Feature dictionary to compare against ``v1``.

    Returns:
        ``1 - d`` as a float, where ``d`` is the Euclidean distance between
        the two value vectors, or ``None`` when there is no common numeric
        key. The value is not clamped, so it is negative when ``d > 1``.
    """
    # HOTFIX: rename 'ratio_unique_traces_per_trace'. Work on a shallow copy
    # so the caller's dictionary keeps its original keys.
    if 'ratio_unique_traces_per_trace' in v1:
        v1 = dict(v1)
        v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')

    # Keep keys present in both dictionaries with numeric values on both
    # sides. Sorting fixes the component order, so the floating-point result
    # does not depend on set iteration order (string hash randomization).
    common_keys = set(v1).intersection(v2)
    numeric_keys = sorted(
        (
            k for k in common_keys
            if isinstance(v1[k], (int, float)) and isinstance(v2[k], (int, float))
        ),
        key=str,
    )

    if not numeric_keys:
        print("[ERROR]: No common numeric keys found for (Edit) Distance calculation.")
        return None

    # Build aligned vectors from the filtered keys.
    vec1 = np.array([v1[k] for k in numeric_keys])
    vec2 = np.array([v2[k] for k in numeric_keys])

    # Euclidean similarity: one minus the Euclidean distance.
    return float(1 - euclidean(vec1, vec2))