Andrea MH committed on
Commit
35cb48f
·
unverified ·
2 Parent(s): 6b0f858 377d68e

Merge pull request #21 from andreamalhera/20-dashboard-run-gedi-from-dashboard

Browse files
.github/workflows/test_gedi.yml CHANGED
@@ -63,11 +63,11 @@ jobs:
63
 
64
  - name: Compare output 1
65
  run:
66
- diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
67
 
68
  - name: Compare output 2
69
  run:
70
- diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
71
 
72
  - name: Compare output 3
73
  run:
 
63
 
64
  - name: Compare output 1
65
  run:
66
+ diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
67
 
68
  - name: Compare output 2
69
  run:
70
+ diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
71
 
72
  - name: Compare output 3
73
  run:
config.py CHANGED
@@ -1,10 +1,8 @@
1
  import json
2
- import os
3
  import warnings
4
 
5
  from utils.param_keys import PIPELINE_STEP, INPUT_PATH, OUTPUT_PATH
6
  from utils.param_keys.features import FEATURE_SET, FEATURE_PARAMS
7
- from tqdm import tqdm
8
 
9
  def get_model_params_list(alg_json_file: str) :#-> list[dict]:
10
  """
 
1
  import json
 
2
  import warnings
3
 
4
  from utils.param_keys import PIPELINE_STEP, INPUT_PATH, OUTPUT_PATH
5
  from utils.param_keys.features import FEATURE_SET, FEATURE_PARAMS
 
6
 
7
  def get_model_params_list(alg_json_file: str) :#-> list[dict]:
8
  """
config_files/config_layout.json CHANGED
@@ -9,7 +9,7 @@
9
  {
10
  "pipeline_step": "event_logs_generation",
11
  "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
12
- "output_path": "data/test",
13
  "generator_params": {
14
  "experiment": "data/grid_objectives.csv",
15
  "experiment": {"input_path": "data/2_bpic_features.csv",
 
9
  {
10
  "pipeline_step": "event_logs_generation",
11
  "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
12
+ "output_path": "data/frontend/test",
13
  "generator_params": {
14
  "experiment": "data/grid_objectives.csv",
15
  "experiment": {"input_path": "data/2_bpic_features.csv",
data/validation/genELexperiment1_04_02.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.6520971605578558}
 
1
+ {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.7418932364693804}
data/validation/genELexperiment2_07_04.json CHANGED
@@ -1 +1 @@
1
- {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.35199750692556764}
 
1
+ {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.6067951985524301}
data/validation/genELexperiment3_04_nan.json CHANGED
@@ -1 +1 @@
1
- {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.6520972056586477}
 
1
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.7418932612931086}
gedi/augmentation.py CHANGED
@@ -1,8 +1,7 @@
1
  import pandas as pd
2
  from collections import Counter
3
  from datetime import datetime as dt
4
- from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE
5
- from sklearn.preprocessing import Normalizer
6
  from gedi.utils.matrix_tools import insert_missing_data
7
  from utils.param_keys import INPUT_PATH, OUTPUT_PATH
8
  from utils.param_keys.augmentation import AUGMENTATION_PARAMS, NO_SAMPLES, FEATURE_SELECTION, METHOD
 
1
  import pandas as pd
2
  from collections import Counter
3
  from datetime import datetime as dt
4
+ from imblearn.over_sampling import SMOTE
 
5
  from gedi.utils.matrix_tools import insert_missing_data
6
  from utils.param_keys import INPUT_PATH, OUTPUT_PATH
7
  from utils.param_keys.augmentation import AUGMENTATION_PARAMS, NO_SAMPLES, FEATURE_SELECTION, METHOD
gedi/benchmark.py CHANGED
@@ -5,15 +5,12 @@ import pandas as pd
5
  import subprocess
6
 
7
  from datetime import datetime as dt
8
- from functools import partial, partialmethod
9
  from itertools import repeat
10
- from pathlib import Path
11
- from pm4py import read_xes, convert_to_bpmn, read_bpmn, convert_to_petri_net, check_soundness
12
  from pm4py import discover_petri_net_inductive, discover_petri_net_ilp, discover_petri_net_heuristics
13
- from pm4py import fitness_alignments, fitness_token_based_replay
14
- from pm4py import precision_alignments, precision_token_based_replay
15
- from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
16
- from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
17
  from pm4py.objects.bpmn.obj import BPMN
18
  from pm4py.objects.log.importer.xes import importer as xes_importer
19
  from gedi.utils.io_helpers import dump_features_json
 
5
  import subprocess
6
 
7
  from datetime import datetime as dt
8
+ from functools import partialmethod
9
  from itertools import repeat
10
+ from pm4py import convert_to_bpmn, read_bpmn, convert_to_petri_net, check_soundness
 
11
  from pm4py import discover_petri_net_inductive, discover_petri_net_ilp, discover_petri_net_heuristics
12
+ from pm4py import fitness_alignments
13
+ from pm4py import precision_alignments
 
 
14
  from pm4py.objects.bpmn.obj import BPMN
15
  from pm4py.objects.log.importer.xes import importer as xes_importer
16
  from gedi.utils.io_helpers import dump_features_json
gedi/features.py CHANGED
@@ -1,14 +1,12 @@
1
  import json
2
  import multiprocessing
3
- import numpy as np
4
  import pandas as pd
5
  import os
6
 
7
  from datetime import datetime as dt
8
  from functools import partial
9
  from feeed.feature_extractor import extract_features
10
- from pathlib import Path, PurePath
11
- from sklearn.impute import SimpleImputer
12
  from utils.param_keys import INPUT_PATH
13
  from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
14
  from gedi.utils.io_helpers import dump_features_json
 
1
  import json
2
  import multiprocessing
 
3
  import pandas as pd
4
  import os
5
 
6
  from datetime import datetime as dt
7
  from functools import partial
8
  from feeed.feature_extractor import extract_features
9
+ from pathlib import Path
 
10
  from utils.param_keys import INPUT_PATH
11
  from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
12
  from gedi.utils.io_helpers import dump_features_json
gedi/plotter.py CHANGED
@@ -12,14 +12,13 @@ from matplotlib.axes import Axes
12
  from matplotlib.figure import Figure
13
  from matplotlib.lines import Line2D
14
  from utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
15
- from utils.param_keys import INPUT_PATH, OUTPUT_PATH, PIPELINE_STEP
16
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
17
  from utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
18
  from collections import defaultdict
19
 
20
  from sklearn.preprocessing import Normalizer, StandardScaler
21
  from sklearn.decomposition import PCA
22
- from sklearn.metrics.pairwise import euclidean_distances
23
  from gedi.generator import get_tasks
24
  from gedi.utils.io_helpers import get_keys_abbreviation
25
  from gedi.utils.io_helpers import read_csvs, select_instance
 
12
  from matplotlib.figure import Figure
13
  from matplotlib.lines import Line2D
14
  from utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
15
+ from utils.param_keys import OUTPUT_PATH, PIPELINE_STEP
16
  from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
17
  from utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
18
  from collections import defaultdict
19
 
20
  from sklearn.preprocessing import Normalizer, StandardScaler
21
  from sklearn.decomposition import PCA
 
22
  from gedi.generator import get_tasks
23
  from gedi.utils.io_helpers import get_keys_abbreviation
24
  from gedi.utils.io_helpers import read_csvs, select_instance
gedi/utils/io_helpers.py CHANGED
@@ -6,7 +6,7 @@ import re
6
  import shutil
7
  import numpy as np
8
  from collections import defaultdict
9
- from pathlib import Path, PurePath
10
  from scipy.spatial.distance import euclidean
11
 
12
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
@@ -86,7 +86,11 @@ def dump_features_json(features: dict, output_path, content_type="features"):
86
 
87
  def compute_similarity(v1, v2):
88
 
89
- # HOTFIX: Rename 'ratio_unique_traces_per_trace
 
 
 
 
90
  if 'ratio_unique_traces_per_trace' in v1:
91
  v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
92
 
@@ -104,7 +108,7 @@ def compute_similarity(v1, v2):
104
 
105
  else:
106
  # Calculate Euclidean Similarity
107
- target_similarity = 1-euclidean(vec1, vec2)
108
- #print("VECTORS: ", vec1, vec2, target_similarity)
109
 
110
  return target_similarity
 
6
  import shutil
7
  import numpy as np
8
  from collections import defaultdict
9
+ from pathlib import PurePath
10
  from scipy.spatial.distance import euclidean
11
 
12
  def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
 
86
 
87
  def compute_similarity(v1, v2):
88
 
89
+ # Convert all values to float except for the value for the key "log"
90
+ v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
91
+ v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
92
+
93
+ # HOTFIX: Rename 'ratio_unique_traces_per_trace'
94
  if 'ratio_unique_traces_per_trace' in v1:
95
  v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
96
 
 
108
 
109
  else:
110
  # Calculate Euclidean Similarity
111
+ target_similarity = 1 / (1 + euclidean(vec1, vec2))
112
+ # print("VECTORS: ", vec1, vec2, target_similarity)
113
 
114
  return target_similarity
gedi/utils/logo.png ADDED
setup.py CHANGED
@@ -1,4 +1,4 @@
1
- from setuptools import setup, find_packages
2
  import os
3
 
4
  with open("README.md", "r") as fh:
 
1
+ from setuptools import setup
2
  import os
3
 
4
  with open("README.md", "r") as fh:
utils/config_fabric.py CHANGED
@@ -1,24 +1,59 @@
1
- from copy import deepcopy
2
- from importlib import reload
3
  from itertools import product as cproduct
4
  from itertools import combinations
 
5
  from pylab import *
6
- import itertools
7
  import json
8
  import math
9
  import os
10
  import pandas as pd
11
- import random
12
  import streamlit as st
13
  import subprocess
 
 
14
 
15
  st.set_page_config(layout='wide')
16
  INPUT_XES="output/inputlog_temp.xes"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- """
19
- # Configuration File fabric for
20
- ## GEDI: **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
21
- """
22
  def double_switch(label_left, label_right, third_label=None, fourth_label=None):
23
  if third_label==None and fourth_label==None:
24
  # Create two columns for the labels and toggle switch
@@ -185,10 +220,11 @@ def set_generator_experiments(generator_params):
185
  with col2:
186
  sel_features = feature_select()
187
 
 
188
  values_indexes = ["value "+str(i+1) for i in range(num_values)]
189
  values_defaults = ['*(1+2*0.'+str(i)+')' for i in range(num_values)]
190
  cross_labels = [feature[0]+': '+feature[1] for feature in list(cproduct(sel_features,values_indexes))]
191
- cross_values = [round(eval(str(combination[0])+combination[1]), 2) for combination in list(cproduct(list(generator_params['experiment'].values()), values_defaults))]
192
  parameters = split_list(list(input_multicolumn(cross_labels, cross_values, n_cols=num_values)), len(sel_features))
193
  tasks = f"list({parameters})"
194
 
@@ -234,6 +270,7 @@ def set_generator_experiments(generator_params):
234
  return generator_params
235
 
236
  if __name__ == '__main__':
 
237
  config_layout = json.load(open("config_files/config_layout.json"))
238
  type(config_layout)
239
  step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
@@ -266,25 +303,65 @@ if __name__ == '__main__':
266
  output_path = st.text_input("Output file path", "config_files/experiment_config.json")
267
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
268
  save_labels = ["Save config file", "Save and run config_file"]
269
- save_labels = ["Save configuration file"]
270
- #create_button, create_run_button = multi_button(save_labels)
271
- create_button = multi_button(save_labels)
272
- # FIXME: Bug: automatically updates the experiment_config.json file even without pressing the save button
273
- if create_button: # or create_run_button:
274
  with open(output_path, "w") as f:
275
  f.write(config_file)
276
  st.write("Saved configuration in ", output_path, ". Run command:")
277
- #if create_run_button:
278
- if True:
279
- var = f"python -W ignore main.py -a {output_path}"
280
- st.code(var, language='bash')
281
- if False: #FIXME: Command fails when using multiprocessing
282
- command = var.split()
283
 
284
- # Run the command
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  result = subprocess.run(command, capture_output=True, text=True)
286
-
287
- if len(result.stderr)==0:
288
- st.write(result.stdout)
289
- else:
290
- st.write("ERROR: ", result.stderr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from itertools import product as cproduct
2
  from itertools import combinations
3
+ from pathlib import Path
4
  from pylab import *
5
+ import base64
6
  import json
7
  import math
8
  import os
9
  import pandas as pd
 
10
  import streamlit as st
11
  import subprocess
12
+ import time
13
+ import shutil
14
 
15
  st.set_page_config(layout='wide')
16
  INPUT_XES="output/inputlog_temp.xes"
17
+ LOGO_PATH="gedi/utils/logo.png"
18
+
19
+ def get_base64_image(image_path):
20
+ with open(image_path, "rb") as image_file:
21
+ return base64.b64encode(image_file.read()).decode()
22
+
23
+ def play_header():
24
+ # Convert local image to base64
25
+ logo_base64 = get_base64_image(LOGO_PATH)
26
+
27
+ # HTML and CSS for placing the logo at the top left corner
28
+ head1, head2 = st.columns([1,8])
29
+ head1.markdown(
30
+ f"""
31
+ <style>
32
+ .header-logo {{
33
+ display: flex;
34
+ align-items: center;
35
+ justify-content: flex-start;
36
+ }}
37
+ .header-logo img {{
38
+ max-width: 120px; /* Adjust the size as needed */
39
+ height: auto;
40
+ }}
41
+ </style>
42
+ <div class="header-logo">
43
+ <img src="data:image/png;base64,{logo_base64}" alt="Logo">
44
+ </div>
45
+ """,
46
+ unsafe_allow_html=True
47
+ )
48
+ with head2:
49
+ """
50
+ # interactive GEDI
51
+ """
52
+ """
53
+ ## **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
54
+ """
55
+ return
56
 
 
 
 
 
57
  def double_switch(label_left, label_right, third_label=None, fourth_label=None):
58
  if third_label==None and fourth_label==None:
59
  # Create two columns for the labels and toggle switch
 
220
  with col2:
221
  sel_features = feature_select()
222
 
223
+ filtered_dict = {key: generator_params['experiment'][key] for key in sel_features if key in generator_params['experiment']}
224
  values_indexes = ["value "+str(i+1) for i in range(num_values)]
225
  values_defaults = ['*(1+2*0.'+str(i)+')' for i in range(num_values)]
226
  cross_labels = [feature[0]+': '+feature[1] for feature in list(cproduct(sel_features,values_indexes))]
227
+ cross_values = [round(eval(str(combination[0])+combination[1]), 2) for combination in list(cproduct(list(filtered_dict.values()), values_defaults))]
228
  parameters = split_list(list(input_multicolumn(cross_labels, cross_values, n_cols=num_values)), len(sel_features))
229
  tasks = f"list({parameters})"
230
 
 
270
  return generator_params
271
 
272
  if __name__ == '__main__':
273
+ play_header()
274
  config_layout = json.load(open("config_files/config_layout.json"))
275
  type(config_layout)
276
  step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
 
303
  output_path = st.text_input("Output file path", "config_files/experiment_config.json")
304
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
305
  save_labels = ["Save config file", "Save and run config_file"]
306
+ #save_labels = ["Save configuration file"]
307
+ create_button, create_run_button = multi_button(save_labels)
308
+ #create_button = multi_button(save_labels)
309
+
310
+ if create_button or create_run_button:
311
  with open(output_path, "w") as f:
312
  f.write(config_file)
313
  st.write("Saved configuration in ", output_path, ". Run command:")
314
+ create_button = False
315
+ var = f"python -W ignore main.py -a {output_path}"
316
+ st.code(var, language='bash')
 
 
 
317
 
318
+ if create_run_button:
319
+ command = var.split()
320
+ progress_bar = st.progress(0) # Initialize the progress bar
321
+
322
+ directory = Path(step_config['output_path']).parts
323
+ path = os.path.join(directory[0], 'features', *directory[1:])
324
+ if os.path.exists(path): shutil.rmtree(path)
325
+
326
+ # Simulate running the command with a loop and updating the progress bar
327
+ for i in range(95):
328
+ time.sleep(0.2) # Simulate the time taken for each step
329
+ progress_bar.progress(i + 1)
330
+
331
+ # Run the actual command
332
  result = subprocess.run(command, capture_output=True, text=True)
333
+ st.write("## Results")
334
+ # st.write(*step_config['generator_params']['experiment'][0].keys(), "log name", "target similarity")
335
+
336
+ directory = Path(step_config['output_path']).parts
337
+ path = os.path.join(directory[0], 'features', *directory[1:])
338
+
339
+ dataframes = []
340
+ # Walk through all directories and files
341
+ for root, dirs, files in os.walk(path):
342
+ feature_files = [os.path.join(root, file) for file in files]
343
+ for feature_file in feature_files:
344
+
345
+ df_temp = pd.read_json(feature_file,lines=True)
346
+ dataframes.append(df_temp)
347
+ # Print the contents of the JSON file
348
+ # st.write(*config_targets.values(), data['log'], data['target_similarity'])
349
+ dataframes = pd.concat(dataframes, ignore_index=True)
350
+ # dataframes = dataframes.sort_values(by=['log'])
351
+ dataframes = dataframes.set_index('log')
352
+ col1, col2 = st.columns([2, 3]) # Adjust the ratio as needed
353
+
354
+ with col1:
355
+ st.dataframe(dataframes)
356
+
357
+ with col2:
358
+ plt.figure(figsize=(4, 2))
359
+ plt.plot(dataframes.index, dataframes['target_similarity'], 'o-')
360
+ plt.xlabel('log', fontsize=5)
361
+ plt.ylabel('target_similarity', fontsize=5)
362
+ plt.xticks(rotation=45, ha='right', fontsize=5)
363
+ plt.tight_layout()
364
+ st.pyplot(plt)
365
+
366
+ # Optional: Updating the progress bar to indicate completion
367
+ progress_bar.progress(100)
utils/merge_csvs.py CHANGED
@@ -1,9 +1,7 @@
1
  import os
2
  import pandas as pd
3
  import sys
4
- import tqdm
5
 
6
- from gedi.utils.io_helpers import sort_files
7
 
8
  FILE_START = sys.argv[1]
9
  ROOT_PATH, FILE_START = os.path.split(FILE_START)
 
1
  import os
2
  import pandas as pd
3
  import sys
 
4
 
 
5
 
6
  FILE_START = sys.argv[1]
7
  ROOT_PATH, FILE_START = os.path.split(FILE_START)