Andrea MH committed
Commit 35765eb · unverified · parents: 6d970d7, 08a0359

Merge pull request #12 from andreamalhera/11-debug-run-generation-for-f-triangle

.conda.yml CHANGED
@@ -1,22 +1,156 @@
 name: gedi
 channels:
 - conda-forge
+- defaults
 dependencies:
-- python=3.9
-- numpy=1.23.1
-- scikit-learn=1.2.2
-- scipy
-- pandas
-- matplotlib
-- pip
+- asttokens=2.4.1=pyhd8ed1ab_0
+- backcall=0.2.0=pyh9f0ad1d_0
+- certifi=2024.2.2=pyhd8ed1ab_0
+- colorama=0.4.6=pyhd8ed1ab_0
+- comm=0.2.2=pyhd8ed1ab_0
+- cycler=0.12.1=pyhd8ed1ab_0
+- decorator=5.1.1=pyhd8ed1ab_0
+- executing=2.0.1=pyhd8ed1ab_0
+- importlib-metadata=7.1.0=pyha770c72_0
+- importlib-resources=6.4.0=pyhd8ed1ab_0
+- importlib_metadata=7.1.0=hd8ed1ab_0
+- importlib_resources=6.4.0=pyhd8ed1ab_0
+- jedi=0.19.1=pyhd8ed1ab_0
+- joblib=1.4.0=pyhd8ed1ab_0
+- jupyter_client=8.6.1=pyhd8ed1ab_0
+- matplotlib-inline=0.1.7=pyhd8ed1ab_0
+- munkres=1.1.4=pyh9f0ad1d_0
+- nest-asyncio=1.6.0=pyhd8ed1ab_0
+- packaging=24.0=pyhd8ed1ab_0
+- parso=0.8.4=pyhd8ed1ab_0
+- pickleshare=0.7.5=py_1003
+- pip=24.0=pyhd8ed1ab_0
+- platformdirs=4.2.0=pyhd8ed1ab_0
+- ply=3.11=pyhd8ed1ab_2
+- prompt-toolkit=3.0.42=pyha770c72_0
+- prompt_toolkit=3.0.42=hd8ed1ab_0
+- pure_eval=0.2.2=pyhd8ed1ab_0
+- pygments=2.17.2=pyhd8ed1ab_0
+- pyparsing=3.1.2=pyhd8ed1ab_0
+- python-dateutil=2.9.0=pyhd8ed1ab_0
+- python-tzdata=2024.1=pyhd8ed1ab_0
+- python_abi=3.9=4_cp39
+- pytz=2024.1=pyhd8ed1ab_0
+- setuptools=69.5.1=pyhd8ed1ab_0
+- six=1.16.0=pyh6c4a22f_0
+- stack_data=0.6.2=pyhd8ed1ab_0
+- threadpoolctl=3.4.0=pyhc1e730c_0
+- toml=0.10.2=pyhd8ed1ab_0
+- tomli=2.0.1=pyhd8ed1ab_0
+- traitlets=5.14.3=pyhd8ed1ab_0
+- typing_extensions=4.11.0=pyha770c72_0
+- tzdata=2024a=h0c530f3_0
+- wcwidth=0.2.13=pyhd8ed1ab_0
+- wheel=0.43.0=pyhd8ed1ab_1
+- zipp=3.17.0=pyhd8ed1ab_0
 - pip:
-  - pm4py==2.7.2
-  - imblearn
-  - seaborn
-  - feeed
-  - smac
-  - ConfigSpace==0.7.1
-  - tqdm
-  - Levenshtein
-  - streamlit
-  - streamlit-toggle-switch
+  - altair==5.3.0
+  - asttokens==2.4.1
+  - attrs==23.2.0
+  - backcall==0.2.0
+  - blinker==1.8.2
+  - brotli==1.1.0
+  - cachetools==5.4.0
+  - certifi==2024.2.2
+  - charset-normalizer==3.3.2
+  - click==8.1.7
+  - cloudpickle==3.0.0
+  - comm==0.2.2
+  - configspace==0.7.1
+  - contourpy==1.2.1
+  - cycler==0.12.1
+  - cvxopt==1.3.2
+  - dask==2024.4.1
+  - dask-jobqueue==0.8.5
+  - debugpy==1.8.1
+  - decorator==5.1.1
+  - deprecation==2.1.0
+  - distributed==2024.4.1
+  - emcee==3.1.4
+  - executing==2.0.1
+  - feeed==1.2.0
+  - fsspec==2024.3.1
+  - fonttools==4.51.0
+  - gitdb==4.0.11
+  - gitpython==3.1.43
+  - graphviz==0.20.3
+  - idna==3.7
+  - importlib-metadata==7.1.0
+  - importlib-resources==6.4.0
+  - imbalanced-learn==0.12.2
+  - imblearn==0.0
+  - intervaltree==3.1.0
+  - ipykernel==6.29.3
+  - ipython==8.12.0
+  - jedi==0.19.1
+  - jinja2==3.1.3
+  - joblib==1.4.0
+  - jsonschema==4.23.0
+  - jsonschema-specifications==2023.12.1
+  - jupyter_client==8.6.1
+  - jupyter_core==5.7.2
+  - kiwisolver==1.4.5
+  - levenshtein==0.23.0
+  - llvmlite==0.42.0
+  - locket==1.0.0
+  - lxml==5.2.1
+  - markdown-it-py==3.0.0
+  - markupsafe==2.1.5
+  - matplotlib==3.8.4
+  - matplotlib-inline==0.1.7
+  - mdurl==0.1.2
+  - more-itertools==10.2.0
+  - msgpack==1.0.8
+  - munkres==1.1.4
+  - networkx==3.2.1
+  - numba==0.59.1
+  - numpy==1.26.4
+  - opyenxes==0.3.0
+  - partd==1.4.1
+  - pandas==2.2.2
+  - pm4py==2.7.2
+  - protobuf==5.27.2
+  - pyarrow==17.0.0
+  - pydeck==0.9.1
+  - pydotplus==2.0.2
+  - pynisher==1.0.10
+  - pyrfr==0.9.0
+  - python-dateutil==2.9.0
+  - pyyaml==6.0.1
+  - rapidfuzz==3.8.1
+  - referencing==0.35.1
+  - regex==2023.12.25
+  - requests==2.32.3
+  - rich==13.7.1
+  - rpds-py==0.19.0
+  - seaborn==0.13.2
+  - scikit-learn==1.2.2
+  - scipy==1.13.0
+  - slicer==0.0.8
+  - smac==2.0.2
+  - smmap==5.0.1
+  - sortedcontainers==2.4.0
+  - stack_data==0.6.2
+  - streamlit==1.36.0
+  - stringdist==1.0.9
+  - tabulate==0.9.0
+  - tblib==3.0.0
+  - tenacity==8.5.0
+  - threadpoolctl==3.4.0
+  - toml==0.10.2
+  - tomli==2.0.1
+  - tornado==6.4
+  - tqdm==4.65.0
+  - toolz==0.12.1
+  - tzdata==2024.1
+  - urllib3==2.2.1
+  - watchdog==4.0.1
+  - xgboost==2.1.0
+  - zict==3.0.0
+  - zipp==3.17.0
+  - zstd==1.5.5.1
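Note: the environment file is now fully pinned, down to conda build strings and exact pip versions. A quick sanity check after `conda env create -f .conda.yml` (a sketch, not part of this commit; package names and versions taken from the pip section above):

    # Spot-check a few of the pip pins resolved in the active environment.
    from importlib.metadata import version

    for pkg, pinned in {"pm4py": "2.7.2", "smac": "2.0.2", "streamlit": "1.36.0"}.items():
        assert version(pkg) == pinned, f"{pkg}: expected {pinned}, got {version(pkg)}"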
.github/workflows/test_gedi.yml CHANGED
@@ -69,6 +69,14 @@ jobs:
         run:
           diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
 
+      - name: Compare output 3
+        run:
+          diff data/validation/genELexperiment3_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json
+
+      - name: Compare output 4
+        run:
+          diff data/validation/genELexperiment4_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json
+
   test_benchmark:
     runs-on: ubuntu-latest
 
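Note: the two new CI steps `diff` the feature JSONs generated for the NaN rows against the checked-in validation files. A minimal local stand-in (paths copied from the workflow; comparing parsed JSON is semantically looser than a byte-level `diff`):

    import json

    def assert_same(expected, actual):
        # Compare parsed JSON rather than raw bytes, so key order does not matter.
        with open(expected) as e, open(actual) as a:
            assert json.load(e) == json.load(a), f"{actual} differs from {expected}"

    assert_same("data/validation/genELexperiment3_04.json",
                "output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json")
    assert_same("data/validation/genELexperiment4_02.json",
                "output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json")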
data/test/grid_feat.csv CHANGED
@@ -1,3 +1,5 @@
 log,ratio_top_20_variants,epa_normalized_sequence_entropy_linear_forgetting
 experiment1,0.2,0.4
 experiment2,0.4,0.7
+experiment3,NaN,0.4
+experiment4,0.2,NaN
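Note: each new row leaves exactly one objective empty; pandas parses the NaN cells as float('nan'), which feeds the dropna handling added in gedi/generator.py below. A sketch of how one row becomes a partial objective dict:

    import pandas as pd

    tasks = pd.read_csv("data/test/grid_feat.csv")
    row = tasks.iloc[2].drop("log")   # experiment3: ratio_top_20_variants is NaN
    print(row.dropna().to_dict())
    # {'epa_normalized_sequence_entropy_linear_forgetting': 0.4}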
data/validation/genELexperiment3_04.json ADDED
@@ -0,0 +1 @@
+{"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "experiment3"}
data/validation/genELexperiment4_02.json ADDED
@@ -0,0 +1 @@
+{"ratio_top_20_variants": 0.2, "log": "experiment4"}
gedi/benchmark.py CHANGED
@@ -109,7 +109,7 @@ class BenchmarkTest:
         results['log'] = log_name
 
         print(f" SUCCESS: {miner} miner for {results} took {dt.now()-start_miner} sec.")
-        dump_features_json(results, dump_path, log_name, content_type="benchmark")
+        dump_features_json(results, os.path.join(dump_path, log_name), content_type="benchmark")
         return
 
     def split_miner_wrapper(self, log_path="data/real_event_logs/BPI_Challenges/BPI_Challenge_2012.xes"):
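Note: dump_features_json no longer takes a separate identifier argument (see gedi/utils/io_helpers.py below); the log name is folded into the path up front. A sketch with assumed values for dump_path and log_name:

    import os

    dump_path, log_name = "output/grid_feat", "BPI_Challenge_2012"
    output_path = os.path.join(dump_path, log_name)
    # dump_features_json(results, output_path, content_type="benchmark")
    # would now write output/benchmark/grid_feat/BPI_Challenge_2012.json

The same call-site change lands in gedi/features.py below.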
gedi/features.py CHANGED
@@ -159,6 +159,6 @@ class EventLogFeatures(EventLogFile):
 
                 identifier = file.rsplit(".", 1)[0]
                 print(f" DONE: {file_path}. FEEED computed {feature_set}")
-                dump_features_json(features, self.root_path, identifier)
+                dump_features_json(features, os.path.join(self.root_path,identifier))
                 return features
 
gedi/generator.py CHANGED
@@ -19,7 +19,8 @@ from pm4py.sim import play_out
 from smac import HyperparameterOptimizationFacade, Scenario
 from utils.param_keys import OUTPUT_PATH, INPUT_PATH
 from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
-from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, read_csvs
+from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json
+from gedi.utils.io_helpers import read_csvs
 import xml.etree.ElementTree as ET
 import re
 from xml.dom import minidom
@@ -80,7 +81,7 @@ def removeextralines(elem):
         element.tail=""
         if not re.search(hasWords,str(element.text)):
             element.text = ""
-
+
 def add_extension_before_traces(xes_file):
     # Register the namespace
     ET.register_namespace('', "http://www.xes-standard.org/")
@@ -158,12 +159,13 @@ class GenerateEventLogs():
         tasks=tasks.rename(columns={"ratio_variants_per_number_of_traces": "ratio_unique_traces_per_trace"})
 
         if tasks is not None:
+            self.feature_keys = sorted([feature for feature in tasks.columns.tolist() if feature != "log"])
             num_cores = multiprocessing.cpu_count() if len(tasks) >= multiprocessing.cpu_count() else len(tasks)
             #self.generator_wrapper([*tasks.iterrows()][0])# For testing
             with multiprocessing.Pool(num_cores) as p:
                 print(f"INFO: Generator starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(tasks)} tasks...")
                 random.seed(RANDOM_SEED)
-                log_config = p.map(self.generator_wrapper, tasks.iterrows())
+                log_config = p.map(self.generator_wrapper, [(index, row) for index, row in tasks.iterrows()])
             self.log_config = log_config
 
         else:
@@ -192,7 +194,7 @@ class GenerateEventLogs():
         except IndexError:
             identifier = task[0]+1
         task = task[1].loc[lambda x, identifier=identifier: x!=identifier]
-        self.objectives = task.to_dict()
+        self.objectives = task.dropna().to_dict()
         random.seed(RANDOM_SEED)
         self.configs = self.optimize()
 
@@ -207,8 +209,8 @@ class GenerateEventLogs():
         if self.objectives.get('ratio_unique_traces_per_trace'):#HOTFIX
             self.objectives['ratio_variants_per_number_of_traces']=self.objectives.pop('ratio_unique_traces_per_trace')
 
-        save_path = get_output_key_value_location(self.objectives,
-                                                  self.output_path, identifier)+".xes"
+        save_path = get_output_key_value_location(task.to_dict(),
+                                                  self.output_path, identifier, self.feature_keys)+".xes"
 
         write_xes(log_config['log'], save_path)
         add_extension_before_traces(save_path)
@@ -219,7 +221,7 @@ class GenerateEventLogs():
         if features_to_dump.get('ratio_unique_traces_per_trace'):#HOTFIX
             features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
         features_to_dump['log'] = identifier.replace('genEL', '')
-        dump_features_json(features_to_dump, self.output_path, identifier, objectives=self.objectives)
+        dump_features_json(features_to_dump, save_path)
         return log_config
 
     def generate_optimized_log(self, config):
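Note: the NaN handling works in two parts here: self.objectives keeps only the non-NaN targets for the optimizer, while save_path is built from the full task dict plus self.feature_keys, so every generated file name still carries a slot for each feature and a missing value surfaces as "nan" (matching the genEL*_nan* names in the workflow above). A sketch, assuming one row from data/test/grid_feat.csv:

    import math

    task = {"ratio_top_20_variants": float("nan"),
            "epa_normalized_sequence_entropy_linear_forgetting": 0.4}
    # Equivalent of task.dropna().to_dict() on a pandas Series:
    objectives = {k: v for k, v in task.items() if not math.isnan(v)}
    print(objectives)  # {'epa_normalized_sequence_entropy_linear_forgetting': 0.4}
    # get_output_key_value_location(task, output_path, identifier, self.feature_keys)
    # then still encodes both features, one of them as "nan", in the .xes file name.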
gedi/utils/io_helpers.py CHANGED
@@ -52,9 +52,10 @@ def get_keys_abbreviation(obj_keys):
         abbreviated_keys.append(abbreviated_key)
     return '_'.join(abbreviated_keys)
 
-def get_output_key_value_location(obj, output_path, identifier):
+def get_output_key_value_location(obj, output_path, identifier, obj_keys=None):
     obj_sorted = dict(sorted(obj.items()))
-    obj_keys = [*obj_sorted.keys()]
+    if obj_keys is None:
+        obj_keys = [*obj_sorted.keys()]
 
     obj_values = [round(x, 4) for x in [*obj_sorted.values()]]
 
@@ -71,15 +72,11 @@ def get_output_key_value_location(obj, output_path, identifier):
     save_path = os.path.join(folder_path, generated_file_name)
     return save_path
 
-def dump_features_json(features: dict, output_path, identifier, objectives=None, content_type="features"):
-    output_parts = PurePath(output_path).parts
-    feature_dir = os.path.join(output_parts[0], content_type,
+def dump_features_json(features: dict, output_path, content_type="features"):
+    output_parts = PurePath(output_path.split(".xes")[0]).parts
+    features_path = os.path.join(output_parts[0], content_type,
                                *output_parts[1:])
-    if objectives is not None:
-        json_path = get_output_key_value_location(objectives,
-                                                  feature_dir, identifier)+".json"
-    else:
-        json_path = os.path.join(feature_dir, identifier)+".json"
+    json_path = features_path+'.json'
 
     os.makedirs(os.path.split(json_path)[0], exist_ok=True)
     with open(json_path, 'w') as fp:
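Note: dump_features_json now derives everything from a single output_path: it strips a trailing .xes if present, splices content_type in after the first path component, and appends .json. A sketch of the path arithmetic with an assumed input:

    import os
    from pathlib import PurePath

    output_path = "output/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.xes"
    parts = PurePath(output_path.split(".xes")[0]).parts
    json_path = os.path.join(parts[0], "features", *parts[1:]) + ".json"
    print(json_path)
    # output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json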
utils/merge_jsons.py CHANGED
@@ -4,12 +4,12 @@ import csv
 import os
 
 """
-   Run using:
+    Run using:
     python merge_jsons.py path_to_your_json_directory output.csv
 
 """
 def json_to_csv(json_dir, output_csv):
-
+
     json_files = [os.path.join(json_dir, file) for file in os.listdir(json_dir) if file.endswith('.json')]
 
     # Collect data from all JSON files
@@ -18,13 +18,13 @@ def json_to_csv(json_dir, output_csv):
         with open(json_file, 'r') as f:
             data = json.load(f)
             all_data.append(data)
-
+
     # Extract the headers from the first JSON object
     if all_data:
-        headers = all_data[0].keys()
+        headers = {elem for s in [set(i) for i in [d.keys() for d in all_data]] for elem in s}
    else:
         raise ValueError("No data found in JSON files")
-
+
     # Write data to CSV
     with open(output_csv, 'w', newline='') as f:
         writer = csv.DictWriter(f, fieldnames=headers)
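Note: taking the union of keys across all JSON files keeps csv.DictWriter from raising ValueError when later files carry keys the first one lacks; missing cells default to the writer's restval (an empty string). The in-code comment still says "first JSON object", which now undersells the change. A flatter, equivalent form of the new expression (sample data assumed):

    import csv, io

    all_data = [{"log": "experiment3", "epa_normalized_sequence_entropy_linear_forgetting": 0.05},
                {"log": "experiment4", "ratio_top_20_variants": 0.2}]
    headers = {key for d in all_data for key in d}  # same result as the nested set build above
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=sorted(headers))
    writer.writeheader()
    writer.writerows(all_data)  # absent keys fall back to restval=''
    print(buf.getvalue())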