Andrea Maldonado commited on
Commit
277c251
·
1 Parent(s): 1296e6b

WIP Adds config file fabric

Browse files
README.md CHANGED
@@ -63,7 +63,7 @@ The JSON file consists of the following key-value pairs:
63
  - real_eventlog_path: defines the file with the features extracted from the real event logs
64
  - plot_type: defines the style of the output plotting (possible values: violinplot, boxplot)
65
  - font_size: label font size of the output plot
66
- - boxplot_widht: width of the violinplot/boxplot
67
 
68
 
69
  ### Generation
@@ -153,8 +153,16 @@ To execute the experiments with grid targets, a single [configuration](config_fi
153
  conda activate gedi
154
  python execute_grid_experiments.py config_files/algorithm/grid_2obj
155
  ```
156
- We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/algorithm/grid_2obj) and [objective](data/grid_2obj) files for this experiment. For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
 
 
 
 
157
 
 
 
 
 
158
  ### Visualizations
159
  To run the visualizations, we employ [jupyter notebooks](https://jupyter.org/install) and [add the installed environment to the jupyter notebook](https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084). We then start all visualizations by running e.g.: `jupyter notebook`. In the following, we describe the `.ipynb`-files in the folder `/notebooks` to reproduce the figures from our paper.
160
 
 
63
  - real_eventlog_path: defines the file with the features extracted from the real event logs
64
  - plot_type: defines the style of the output plotting (possible values: violinplot, boxplot)
65
  - font_size: label font size of the output plot
66
+ - boxplot_width: width of the violinplot/boxplot
67
 
68
 
69
  ### Generation
 
153
  conda activate gedi
154
  python execute_grid_experiments.py config_files/algorithm/grid_2obj
155
  ```
156
+ We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/algorithm/grid_2obj) and [objective](data/grid_2obj) files for this experiment.
157
+ For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
158
+ To create configuration files for grid objectives interactively, you can start the following dashboard:
159
+ ```
160
+ streamlit run utils/config_fabric.py # To tunnel to local machine add: --server.port 8501 --server.headless true
161
 
162
+ # In local machine (only in case you are tunneling):
163
+ ssh -N -f -L 9000:localhost:8501 <user@remote_machine.com>
164
+ open "http://localhost:9000/"
165
+ ```
166
  ### Visualizations
167
  To run the visualizations, we employ [jupyter notebooks](https://jupyter.org/install) and [add the installed environment to the jupyter notebook](https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084). We then start all visualizations by running e.g.: `jupyter notebook`. In the following, we describe the `.ipynb`-files in the folder `/notebooks` to reproduce the figures from our paper.
168
 
config_files/config_layout.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "pipeline_step": "instance_augmentation",
4
+ "augmentation_params":{"method":"SMOTE", "no_samples":2,
5
+ "feature_selection": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
6
+ "input_path": "data/test/bpic_features.csv",
7
+ "output_path": "output"
8
+ },
9
+ {
10
+ "pipeline_step": "event_logs_generation",
11
+ "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
12
+ "output_path": "data/test",
13
+ "generator_params": {
14
+ "experiment": "data/grid_objectives.csv",
15
+ "experiment": {"input_path": "data/2_bpic_features.csv",
16
+ "objectives": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
17
+ "experiment": [
18
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.2, "ratio_top_20_variants": 0.4},
19
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.4, "ratio_top_20_variants": 0.7}
20
+ ],
21
+ "experiment": {"epa_normalized_sequence_entropy_linear_forgetting": 0.2, "ratio_top_20_variants": 0.4},
22
+ "config_space": {
23
+ "mode": [5, 20],
24
+ "sequence": [0.01, 1],
25
+ "choice": [0.01, 1],
26
+ "parallel": [0.01, 1],
27
+ "loop": [0.01, 1],
28
+ "silent": [0.01, 1],
29
+ "lt_dependency": [0.01, 1],
30
+ "num_traces": [10, 100],
31
+ "duplicate": [0],
32
+ "or": [0]
33
+ },
34
+ "n_trials": 2
35
+ }
36
+ },
37
+ {
38
+ "pipeline_step": "feature_extraction",
39
+ "input_path": "data/test",
40
+ "feature_params": {"feature_set":["trace_length"]},
41
+ "output_path": "output/plots",
42
+ "real_eventlog_path": "data/BaselineED_feat.csv",
43
+ "plot_type": "boxplot"
44
+ },
45
+ {
46
+ "pipeline_step": "benchmark_test",
47
+ "benchmark_task": "discovery",
48
+ "input_path":"data/test",
49
+ "output_path":"output",
50
+ "miners" : ["inductive", "heu", "imf", "ilp"]
51
+ }
52
+ ]
utils/config_fabric.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from importlib import reload
3
+ from itertools import product
4
+ from pylab import *
5
+ import itertools
6
+ import json
7
+ import os
8
+ import pandas as pd
9
+ import pm4py
10
+ import random
11
+ import streamlit as st
12
+
13
+ st.set_page_config(layout='wide')
14
+ INPUT_XES="output/inputlog_temp.xes"
15
+
16
+ """
17
+ # Configuration File fabric for
18
+ ## GEDI: **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
19
+ """
20
+ def double_switch(label_left, label_right):
21
+ # Create two columns for the labels and toggle switch
22
+ col0, col1, col2, col3, col4 = st.columns([4,1, 1, 1,4])
23
+
24
+ # Add labels to the columns
25
+ with col1:
26
+ st.write(label_left)
27
+
28
+ with col2:
29
+ # Create the toggle switch
30
+ toggle_option = st.toggle(" ",value=False,
31
+ key="toggle_switch_"+label_left,
32
+ )
33
+
34
+ with col3:
35
+ st.write(label_right)
36
+ return toggle_option
37
+
38
+ def view(config_file):
39
+ st.write(config_file)
40
+
41
+ def get_ranges(stats, tuple_values):
42
+ col_for_row = ", ".join([f"x[\'{i}\'].astype(float)" for i in tuple_values])
43
+ stats['range'] = stats.apply(lambda x: tuple([eval(col_for_row)]), axis=1)
44
+ #tasks = eval(f"list(itertools.product({(parameters*n_para_obj)[:-2]}))")
45
+ result = [f"np.around({x}, 2)" for x in stats['range']]
46
+ result = ", ".join(result)
47
+ return result
48
+
49
+ def create_objectives_grid(df, objectives, n_para_obj=2):
50
+ parameters_o = "objectives, "
51
+ sel_features = df.index.to_list()
52
+ if n_para_obj==len(objectives):
53
+ parameters = get_ranges(df, sorted(objectives))
54
+ tasks = eval(f"list(itertools.product({parameters}))")[0]
55
+ cartesian_product = list(product(*tasks))
56
+ experiments = [{key: value[idx] for idx, key in enumerate(sel_features)} for value in cartesian_product]
57
+ return experiments
58
+ else:
59
+ if n_para_obj==1:
60
+ experiments = [[exp] for exp in objectives]
61
+ else:
62
+ experiments = eval(f"[exp for exp in list(itertools.product({(parameters_o*n_para_obj)[:-2]})) if exp[0]!=exp[1]]")
63
+ experiments = list(set([tuple(sorted(exp)) for exp in experiments]))
64
+ parameters = "np.around(np.arange(0.0, 1.5,0.5),2), "
65
+ tasks = eval(f"list(itertools.product({(parameters*n_para_obj)[:-2]}))")
66
+ print("TASKS", tasks, type(parameters), type(n_para_obj), parameters*n_para_obj)
67
+ #print(len(experiments), experiments)
68
+
69
+ print(len(tasks))
70
+
71
+ for exp in experiments:
72
+ df = pd.DataFrame(data=tasks, columns=["task", *exp])
73
+ #experiment_path = os.path.join('..','data', f'grid_{n_para_obj}obj')
74
+ #os.makedirs(experiment_path, exist_ok=True)
75
+ #experiment_path = os.path.join(experiment_path, f"grid_{len(df.columns)-1}objectives_{abbrev_obj_keys(exp)}.csv")
76
+ #df.to_csv(experiment_path, index=False)
77
+ #print(f"Saved experiment in {experiment_path}")
78
+ #write_generator_experiment(experiment_path, objectives=exp)
79
+
80
+ def set_up(generator_params):
81
+ create_button = False
82
+ experiments = []
83
+
84
+ col1, col2 = st.columns(2)
85
+ if True:
86
+ grid_option = double_switch("Point-", "Grid-based")
87
+ csv_option = double_switch("Manual", "From CSV")
88
+ if csv_option:
89
+ uploaded_file = st.file_uploader(f"Pick a csv-file containing feature values for features:", type="csv")
90
+ if uploaded_file is not None:
91
+ df = pd.read_csv(uploaded_file)
92
+ sel_features = st.multiselect("Selected features", list(df.columns))
93
+ df = df[sel_features]
94
+ if grid_option:
95
+ add_quantile = st.slider('Add %-quantile', min_value=0.0, max_value=100.0, value=50.0, step=5.0)
96
+ stats = df.describe().transpose()
97
+ stats[str(int(add_quantile))+"%"] = df.quantile(q=add_quantile/100)
98
+ view(stats)
99
+ tuple_values = st.multiselect("Tuples including", list(stats.columns)[3:], default=['min', 'max'])
100
+ experiments = create_objectives_grid(stats, tuple_values, n_para_obj=len(tuple_values))
101
+ else:
102
+ view(df)
103
+ experiments = df.to_dict(orient='records')
104
+ else:
105
+ sel_features = st.multiselect("Selected features", list(generator_params['experiment'].keys()))
106
+ for sel_feature in sel_features:
107
+ generator_params['experiment'][sel_feature] = float(st.text_input(sel_feature, generator_params['experiment'][sel_feature]))
108
+ generator_params['experiment'] = experiments
109
+ st.write(f"...result in {len(generator_params['experiment'])} experiments")
110
+
111
+ """
112
+ #### Configuration space
113
+ """
114
+ for key in generator_params['config_space'].keys():
115
+ generator_params['config_space'][key] = st.text_input(key, generator_params['config_space'][key])
116
+
117
+ #generator_params['config_space'] = st.text_input('config_space', generator_params['config_space'])
118
+ generator_params['n_trials'] = int(st.text_input('n_trials', generator_params['n_trials']))
119
+ return generator_params
120
+
121
+ if __name__ == '__main__':
122
+ config_layout = json.load(open("config_files/config_layout.json"))
123
+ type(config_layout)
124
+ step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
125
+ pipeline_steps = st.multiselect(
126
+ "Choose pipeline step",
127
+ step_candidates,
128
+ []
129
+ )
130
+ step_configs = []
131
+ set_col, view_col = st.columns([3, 2])
132
+ for pipeline_step in pipeline_steps:
133
+ step_config = [d for d in config_layout if d['pipeline_step'] == pipeline_step][0]
134
+ with set_col:
135
+ st.header(pipeline_step)
136
+ for step_key in step_config.keys():
137
+ if step_key == "generator_params":
138
+ st.subheader("Set-up experiments")
139
+ step_config[step_key] = set_up(step_config[step_key])
140
+ elif step_key != "pipeline_step":
141
+ step_config[step_key] = st.text_input(step_key, step_config[step_key])
142
+ with view_col:
143
+ view(step_config)
144
+ step_configs.append(step_config)
145
+ config_file = json.dumps(step_configs, indent=4)
146
+ output_path = st.text_input("Output file path", "config_files/experiment_config.json")
147
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
148
+ create_button = st.button("Save config file")
149
+ if create_button:
150
+ with open(output_path, "w") as f:
151
+ f.write(config_file)