Andrea MH committed on
Commit
1d56290
·
unverified ·
2 Parent(s): 3be42cd bf97d18

Merge pull request #24 from lmu-dbs/demo-icpm24

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .github/workflows/huggingface.yml +28 -0
  2. .github/workflows/pypi_release.yml +101 -0
  3. .github/workflows/test_gedi.yml +50 -10
  4. README.md +288 -23
  5. config.py +6 -69
  6. config_files/config_layout.json +48 -0
  7. config_files/{algorithm/experiment_real_targets.json → experiment_real_targets.json} +0 -0
  8. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enseef.json +0 -0
  9. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enself.json +0 -0
  10. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enve.json +0 -0
  11. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rmcv.json +0 -0
  12. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rt10v.json +0 -0
  13. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rvpnot.json +0 -0
  14. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_enself.json +0 -0
  15. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_enve.json +0 -0
  16. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rmcv.json +0 -0
  17. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rt10v.json +0 -0
  18. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rvpnot.json +0 -0
  19. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_enve.json +0 -0
  20. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rmcv.json +0 -0
  21. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rt10v.json +0 -0
  22. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rvpnot.json +0 -0
  23. config_files/grid_2obj/generator_grid_2objectives_enve_mvo.json +1 -0
  24. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rmcv.json +0 -0
  25. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rt10v.json +0 -0
  26. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rvpnot.json +0 -0
  27. config_files/grid_2obj/generator_grid_2objectives_enve_sam.json +1 -0
  28. config_files/grid_2obj/generator_grid_2objectives_mvo_sam.json +1 -0
  29. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_rmcv_rt10v.json +0 -0
  30. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_rmcv_rvpnot.json +0 -0
  31. config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_rt10v_rvpnot.json +0 -0
  32. config_files/options/baseline.json +0 -9
  33. config_files/options/run_params.json +0 -9
  34. config_files/{algorithm/pipeline_steps → pipeline_steps}/augmentation.json +0 -0
  35. config_files/{algorithm/pipeline_steps → pipeline_steps}/benchmark.json +1 -1
  36. config_files/{algorithm/pipeline_steps → pipeline_steps}/evaluation_plotter.json +0 -0
  37. config_files/{algorithm/pipeline_steps → pipeline_steps}/feature_extraction.json +0 -0
  38. config_files/{algorithm/pipeline_steps → pipeline_steps}/generation.json +0 -0
  39. config_files/{algorithm → test}/experiment_test.json +0 -0
  40. config_files/{algorithm/test → test}/generator_2bpic_2objectives_ense_enseef.json +0 -0
  41. config_files/{algorithm/test → test}/generator_grid_1objectives_rt10v.json +0 -0
  42. config_files/{algorithm/test → test}/generator_grid_2objectives_ense_enself.json +0 -0
  43. config_files/test/test_abbrv_generation.json +16 -0
  44. data/test/grid_experiments/rt10v.csv +0 -12
  45. data/test/grid_feat.csv +2 -0
  46. data/test/igedi_table_1.csv +4 -0
  47. data/validation/2_ense_rmcv_feat.csv +4 -0
  48. data/validation/genELexperiment1_04_02.json +1 -1
  49. data/validation/genELexperiment3_04_nan.json +1 -0
  50. data/validation/genELexperiment4_nan_02.json +1 -0
.github/workflows/huggingface.yml ADDED
@@ -0,0 +1,28 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches:
+       - main
+       - demo-icpm24
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+     - name: Check large files
+       uses: ActionsDesk/[email protected]
+       with:
+         filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
+     - uses: actions/checkout@v3
+       with:
+         fetch-depth: 0
+         lfs: true
+     - name: Set current branch as a variable
+       id: set_branch
+       run: echo "CURRENT_BRANCH=${GITHUB_REF##*/}" >> $GITHUB_ENV
+     - name: Push to hub
+       env:
+         HF_TOKEN: ${{ secrets.HF_TOKEN }}
+       run: git push --force https://andreamalhera:[email protected]/spaces/andreamalhera/igedi $CURRENT_BRANCH:main
.github/workflows/pypi_release.yml ADDED
@@ -0,0 +1,101 @@
+ name: Publish Python 🐍 distribution 📦 to PyPI
+
+ on:
+   push:
+     tags:
+       - 'v*.*.*' # Triggers the workflow when a new version tag is pushed
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+     - name: Check out the code
+       uses: actions/checkout@v4
+
+     - name: Set up Python
+       uses: actions/setup-python@v5
+       with:
+         python-version: '3.x' # Specify your Python version
+
+     - name: Install pypa/build
+       run: >-
+         python3 -m
+         pip install
+         build
+         --user
+     - name: Build a binary wheel and a source tarball
+       run: python3 -m build
+     - name: Store the distribution packages
+       uses: actions/upload-artifact@v3
+       with:
+         name: python-package-distributions
+         path: dist/
+
+   publish-to-pypi:
+     name: >-
+       Publish Python 🐍 distribution 📦 to PyPI
+     if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
+     needs:
+       - build
+     runs-on: ubuntu-latest
+     environment:
+       name: pypi
+       url: https://pypi.org/p/GEDI
+     permissions:
+       id-token: write # IMPORTANT: mandatory for trusted publishing
+
+     steps:
+     - name: Download all the dists
+       uses: actions/download-artifact@v3
+       with:
+         name: python-package-distributions
+         path: dist/
+     - name: Publish distribution 📦 to PyPI
+       uses: pypa/gh-action-pypi-publish@release/v1
+
+   github-release:
+     name: >-
+       Sign the Python 🐍 distribution 📦 with Sigstore
+       and upload them to GitHub Release
+     needs:
+       - publish-to-pypi
+     runs-on: ubuntu-latest
+
+     permissions:
+       contents: write # IMPORTANT: mandatory for making GitHub Releases
+       id-token: write # IMPORTANT: mandatory for sigstore
+
+     steps:
+     - name: Download all the dists
+       uses: actions/download-artifact@v3
+       with:
+         name: python-package-distributions
+         path: dist/
+     - name: Sign the dists with Sigstore
+       uses: sigstore/[email protected]
+       with:
+         inputs: >-
+           ./dist/*.tar.gz
+           ./dist/*.whl
+     - name: Create GitHub Release
+       env:
+         GITHUB_TOKEN: ${{ github.token }}
+       run: >-
+         gh release create
+         '${{ github.ref_name }}'
+         --repo '${{ github.repository }}'
+         --notes ""
+     - name: Upload artifact signatures to GitHub Release
+       env:
+         GITHUB_TOKEN: ${{ github.token }}
+       # Upload to GitHub Release using the `gh` CLI.
+       # `dist/` contains the built packages, and the
+       # sigstore-produced signatures and certificates.
+       run: >-
+         gh release upload
+         '${{ github.ref_name }}' dist/**
+         --repo '${{ github.repository }}'
+
+     - name: Cleanup
+       run: rm -rf dist
.github/workflows/test_gedi.yml CHANGED
@@ -31,7 +31,7 @@ jobs:
 
     - name: Run test
       run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/feature_extraction.json
+       python main.py -a config_files/pipeline_steps/feature_extraction.json
 
     - name: Compare output
      run: diff data/validation/test_feat.csv data/test_feat.csv
@@ -60,15 +60,23 @@ jobs:
 
     - name: Run test
      run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/generation.json
+       python main.py -a config_files/pipeline_steps/generation.json
 
     - name: Compare output 1
      run:
-       diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
+       diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
 
     - name: Compare output 2
      run:
-       diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
+       diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
+
+    - name: Compare output 3
+      run:
+       diff data/validation/genELexperiment3_04_nan.json output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json
+
+    - name: Compare output 4
+      run:
+       diff data/validation/genELexperiment4_nan_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json
 
   test_benchmark:
     runs-on: ubuntu-latest
@@ -90,10 +98,12 @@ jobs:
 
     - name: Run test
      run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/benchmark.json
+       python main.py -a config_files/pipeline_steps/benchmark.json
 
     - name: Convert output and validation to same encoding
-     run: iconv -f UTF-8 -t ASCII output/benchmark/test_benchmark.csv > data/validation/test_benchmark.csv
+     run: |
+       iconv -f UTF-8 -t ASCII data/validation/test_benchmark.csv > data/validation/test_benchmark.csv
+       iconv -f UTF-8 -t ASCII output/benchmark/test_benchmark.csv > output/benchmark/test_benchmark.csv
 
     - name: Compare output
      run: diff data/validation/test_benchmark.csv output/benchmark/test_benchmark.csv
@@ -118,7 +128,7 @@ jobs:
 
     - name: Run test
      run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/augmentation.json
+       python main.py -a config_files/pipeline_steps/augmentation.json
 
   test_evaluation-plotter:
     runs-on: ubuntu-latest
@@ -144,7 +154,7 @@ jobs:
 
     - name: Run test
      run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/evaluation_plotter.json
+       python main.py -a config_files/pipeline_steps/evaluation_plotter.json
 
   test_integration:
     runs-on: ubuntu-latest
@@ -170,7 +180,7 @@ jobs:
 
     - name: Run test
      run:
-       python main.py -o config_files/options/baseline.json -a config_files/algorithm/experiment_test.json
+       python main.py -a config_files/test/experiment_test.json
 
   test_grid_experiments_script:
     runs-on: ubuntu-latest
@@ -196,10 +206,40 @@ jobs:
 
     - name: Run test
      run:
-       python execute_grid_experiments.py config_files/algorithm/test
+       python gedi/utils/execute_grid_experiments.py config_files/test
 
     - name: Convert output and validation to same encoding
      run: iconv -f UTF-8 -t ASCII output/features/generated/2_bpic_features/2_ense_enseef_feat.csv > data/validation/2_ense_enseef_feat.csv
 
     - name: Compare output
      run: diff data/validation/2_ense_enseef_feat.csv output/features/generated/2_bpic_features/2_ense_enseef_feat.csv
+
+  test_abbrv:
+    runs-on: ubuntu-latest
+
+    # Setting up a python envronment for the test script to run
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get install build-essential python3 python3-dev
+
+    - name: Install feeed
+      run: |
+        python -m pip install --upgrade pip
+        pip install .
+
+    - name: Run test
+      run:
+        python main.py -a config_files/test/test_abbrv_generation.json
+
+    - name: Compare output
+      run:
+        diff data/validation/2_ense_rmcv_feat.csv output/test/igedi_table_1/2_ense_rmcv_feat.csv
README.md CHANGED
@@ -1,15 +1,36 @@
- # GEDI
- **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining<br />
- Codebase for the [GEDI paper](https://mcml.ai/publications/gedi.pdf) published at the [BPM'24 conference proceedings](https://link.springer.com/book/10.1007/978-3-031-70396-6).
+ ---
+ title: iGedi
+ emoji: 🌖
+ colorFrom: indigo
+ colorTo: pink
+ sdk: streamlit
+ sdk_version: 1.38.0
+ app_file: utils/config_fabric.py
+ pinned: false
+ license: mit
+ ---
+
+ <p>
+   <img src="gedi/utils/logo.png" alt="Logo" width="100" align="left" />
+   <h1 style="display: inline;">(i)GEDI</h1>
+ </p>
+
+ (**i**nteractive) **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining<br />
+ This repository contains the codebase for the interactive web application tool (iGEDI) as well as for the [GEDI paper](https://mcml.ai/publications/gedi.pdf) accepted at the BPM'24 conference.
 
 ## Table of Contents
 
- - [Requirements](#requirements)
+ - [Interactive Web Application (iGEDI)](#interactive-web-application)
+ - [Requirements](#requirements)
 - [Installation](#installation)
 - [General Usage](#general-usage)
 - [Experiments](#experiments)
 - [Citation](#citation)
 
+ ## Interactive Web Application
+ Our [interactive web application](https://huggingface.co/spaces/andreamalhera/gedi) (iGEDI) guides you through the specification process, runs GEDI for you. You can directly download the resulting generated logs or the configuration file to run GEDI locally.
+ ![Interface Screenshot](gedi/utils/iGEDI_interface.png)
+
 ## Requirements
 - [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
 - Graphviz on your OS e.g.
@@ -28,7 +49,7 @@ conda install pyrfr swig
 ### Startup
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/experiment_test.json
+ python main.py -a config_files/test/experiment_test.json
 ```
 The last step should take only a few minutes to run.
 
@@ -42,18 +63,18 @@ Our pipeline offers several pipeline steps, which can be run sequentially or par
 To run different steps of the GEDI pipeline, please adapt the `.json` accordingly.
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/<pipeline-step>.json
+ python main.py -a config_files/pipeline_steps/<pipeline-step>.json
 ```
- For reference of possible keys and values for each step, please see `config_files/algorithm/experiment_test.json`.
+ For reference of possible keys and values for each step, please see `config_files/test/experiment_test.json`.
 To run the whole pipeline please create a new `.json` file, specifying all steps you want to run and specify desired keys and values for each step.
- To reproduce results from out paper, please refer to [Experiments](#experiments).
+ To reproduce results from our paper, please refer to [Experiments](#experiments).
 
 ### Feature Extraction
 ---
 To extract the features on the event-log level and use them for hyperparameter optimization, we employ the following script:
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/feature_extraction.json
+ python main.py -a config_files/pipeline_steps/feature_extraction.json
 ```
 The JSON file consists of the following key-value pairs:
 
@@ -64,8 +85,7 @@ The JSON file consists of the following key-value pairs:
 - real_eventlog_path: defines the file with the features extracted from the real event logs
 - plot_type: defines the style of the output plotting (possible values: violinplot, boxplot)
 - font_size: label font size of the output plot
- - boxplot_widht: width of the violinplot/boxplot
-
+ - boxplot_width: width of the violinplot/boxplot
 
 ### Generation
 ---
@@ -75,7 +95,7 @@ The command to execute the generation step is given by a exemplarily generation.
 
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/generation.json
+ python main.py -a config_files/pipeline_steps/generation.json
 ```
 
 In the `generation.json`, we have the following key-value pairs:
@@ -102,12 +122,228 @@ In the `generation.json`, we have the following key-value pairs:
 
 - plot_reference_feature: defines the feature, which is used on the x-axis on the output plots, i.e., each feature defined in the 'objectives' of the 'experiment' is plotted against the reference feature being defined in this value
 
+ In case of manually defining the targets for the features in config space, the following table shows the range of the features in the real-world event log data (BPIC's) for reference:
+ <div style="overflow-x:auto;">
+ <table border="1" class="dataframe">
+   <thead>
+     <tr style="text-align: right;">
+       <th></th>
+       <th>n_traces</th>
+       <th>n_unique_traces</th>
+       <th>ratio_variants_per_number_of_traces</th>
+       <th>trace_len_min</th>
+       <th>trace_len_max</th>
+       <th>trace_len_mean</th>
+       <th>trace_len_median</th>
+       <th>trace_len_mode</th>
+       <th>trace_len_std</th>
+       <th>trace_len_variance</th>
+       <th>trace_len_q1</th>
+       <th>trace_len_q3</th>
+       <th>trace_len_iqr</th>
+       <th>trace_len_geometric_mean</th>
+       <th>trace_len_geometric_std</th>
+       <th>trace_len_harmonic_mean</th>
+       <th>trace_len_skewness</th>
+       <th>trace_len_kurtosis</th>
+       <th>trace_len_coefficient_variation</th>
+       <th>trace_len_entropy</th>
+       <th>trace_len_hist1</th>
+       <th>trace_len_hist2</th>
+       <th>trace_len_hist3</th>
+       <th>trace_len_hist4</th>
+       <th>trace_len_hist5</th>
+       <th>trace_len_hist6</th>
+       <th>trace_len_hist7</th>
+       <th>trace_len_hist8</th>
+       <th>trace_len_hist9</th>
+       <th>trace_len_hist10</th>
+       <th>trace_len_skewness_hist</th>
+       <th>trace_len_kurtosis_hist</th>
+       <th>ratio_most_common_variant</th>
+       <th>ratio_top_1_variants</th>
+       <th>ratio_top_5_variants</th>
+       <th>ratio_top_10_variants</th>
+       <th>ratio_top_20_variants</th>
+       <th>ratio_top_50_variants</th>
+       <th>ratio_top_75_variants</th>
+       <th>mean_variant_occurrence</th>
+       <th>std_variant_occurrence</th>
+       <th>skewness_variant_occurrence</th>
+       <th>kurtosis_variant_occurrence</th>
+       <th>n_unique_activities</th>
+       <th>activities_min</th>
+       <th>activities_max</th>
+       <th>activities_mean</th>
+       <th>activities_median</th>
+       <th>activities_std</th>
+       <th>activities_variance</th>
+       <th>activities_q1</th>
+       <th>activities_q3</th>
+       <th>activities_iqr</th>
+       <th>activities_skewness</th>
+       <th>activities_kurtosis</th>
+       <th>n_unique_start_activities</th>
+       <th>start_activities_min</th>
+       <th>start_activities_max</th>
+       <th>start_activities_mean</th>
+       <th>start_activities_median</th>
+       <th>start_activities_std</th>
+       <th>start_activities_variance</th>
+       <th>start_activities_q1</th>
+       <th>start_activities_q3</th>
+       <th>start_activities_iqr</th>
+       <th>start_activities_skewness</th>
+       <th>start_activities_kurtosis</th>
+       <th>n_unique_end_activities</th>
+       <th>end_activities_min</th>
+       <th>end_activities_max</th>
+       <th>end_activities_mean</th>
+       <th>end_activities_median</th>
+       <th>end_activities_std</th>
+       <th>end_activities_variance</th>
+       <th>end_activities_q1</th>
+       <th>end_activities_q3</th>
+       <th>end_activities_iqr</th>
+       <th>end_activities_skewness</th>
+       <th>end_activities_kurtosis</th>
+       <th>eventropy_trace</th>
+       <th>eventropy_prefix</th>
+       <th>eventropy_global_block</th>
+       <th>eventropy_lempel_ziv</th>
+       <th>eventropy_k_block_diff_1</th>
+       <th>eventropy_k_block_diff_3</th>
+       <th>eventropy_k_block_diff_5</th>
+       <th>eventropy_k_block_ratio_1</th>
+       <th>eventropy_k_block_ratio_3</th>
+       <th>eventropy_k_block_ratio_5</th>
+       <th>eventropy_knn_3</th>
+       <th>eventropy_knn_5</th>
+       <th>eventropy_knn_7</th>
+       <th>epa_variant_entropy</th>
+       <th>epa_normalized_variant_entropy</th>
+       <th>epa_sequence_entropy</th>
+       <th>epa_normalized_sequence_entropy</th>
+       <th>epa_sequence_entropy_linear_forgetting</th>
+       <th>epa_normalized_sequence_entropy_linear_forgetting</th>
+       <th>epa_sequence_entropy_exponential_forgetting</th>
+       <th>epa_normalized_sequence_entropy_exponential_forgetting</th>
+     </tr>
+   </thead>
+   <tbody>
+     <tr>
+       <td>[ min, max ]</td>
+       <td>[ 226.0, 251734.0 ]</td>
+       <td>[ 6.0, 28457.0 ]</td>
+       <td>[ 0.0, 1.0 ]</td>
+       <td>[ 1.0, 24.0 ]</td>
+       <td>[ 1.0, 2973.0 ]</td>
+       <td>[ 1.0, 131.49 ]</td>
+       <td>[ 1.0, 55.0 ]</td>
+       <td>[ 1.0, 61.0 ]</td>
+       <td>[ 0.0, 202.53 ]</td>
+       <td>[ 0.0, 41017.89 ]</td>
+       <td>[ 1.0, 44.0 ]</td>
+       <td>[ 1.0, 169.0 ]</td>
+       <td>[ 0.0, 161.0 ]</td>
+       <td>[ 1.0, 53.78 ]</td>
+       <td>[ 1.0, 5.65 ]</td>
+       <td>[ 1.0, 51.65 ]</td>
+       <td>[ -0.58, 111.97 ]</td>
+       <td>[ -0.97, 14006.75 ]</td>
+       <td>[ 0.0, 4.74 ]</td>
+       <td>[ 5.33, 12.04 ]</td>
+       <td>[ 0.0, 1.99 ]</td>
+       <td>[ 0.0, 0.42 ]</td>
+       <td>[ 0.0, 0.4 ]</td>
+       <td>[ 0.0, 0.19 ]</td>
+       <td>[ 0.0, 0.14 ]</td>
+       <td>[ 0.0, 10.0 ]</td>
+       <td>[ 0.0, 0.02 ]</td>
+       <td>[ 0.0, 0.04 ]</td>
+       <td>[ 0.0, 0.0 ]</td>
+       <td>[ 0.0, 2.7 ]</td>
+       <td>[ -0.58, 111.97 ]</td>
+       <td>[ -0.97, 14006.75 ]</td>
+       <td>[ 0.0, 0.79 ]</td>
+       <td>[ 0.0, 0.87 ]</td>
+       <td>[ 0.0, 0.98 ]</td>
+       <td>[ 0.0, 0.99 ]</td>
+       <td>[ 0.2, 1.0 ]</td>
+       <td>[ 0.5, 1.0 ]</td>
+       <td>[ 0.75, 1.0 ]</td>
+       <td>[ 1.0, 24500.67 ]</td>
+       <td>[ 0.04, 42344.04 ]</td>
+       <td>[ 1.54, 64.77 ]</td>
+       <td>[ 0.66, 5083.46 ]</td>
+       <td>[ 1.0, 1152.0 ]</td>
+       <td>[ 1.0, 66058.0 ]</td>
+       <td>[ 34.0, 466141.0 ]</td>
+       <td>[ 4.13, 66058.0 ]</td>
+       <td>[ 2.0, 66058.0 ]</td>
+       <td>[ 0.0, 120522.25 ]</td>
+       <td>[ 0.0, 14525612122.34 ]</td>
+       <td>[ 1.0, 66058.0 ]</td>
+       <td>[ 4.0, 79860.0 ]</td>
+       <td>[ 0.0, 77290.0 ]</td>
+       <td>[ -0.06, 15.21 ]</td>
+       <td>[ -1.5, 315.84 ]</td>
+       <td>[ 1.0, 809.0 ]</td>
+       <td>[ 1.0, 150370.0 ]</td>
+       <td>[ 27.0, 199867.0 ]</td>
+       <td>[ 3.7, 150370.0 ]</td>
+       <td>[ 1.0, 150370.0 ]</td>
+       <td>[ 0.0, 65387.49 ]</td>
+       <td>[ 0.0, 4275524278.19 ]</td>
+       <td>[ 1.0, 150370.0 ]</td>
+       <td>[ 4.0, 150370.0 ]</td>
+       <td>[ 0.0, 23387.25 ]</td>
+       <td>[ 0.0, 9.3 ]</td>
+       <td>[ -2.0, 101.82 ]</td>
+       <td>[ 1.0, 757.0 ]</td>
+       <td>[ 1.0, 16653.0 ]</td>
+       <td>[ 28.0, 181328.0 ]</td>
+       <td>[ 3.53, 24500.67 ]</td>
+       <td>[ 1.0, 16653.0 ]</td>
+       <td>[ 0.0, 42344.04 ]</td>
+       <td>[ 0.0, 1793017566.89 ]</td>
+       <td>[ 1.0, 16653.0 ]</td>
+       <td>[ 3.0, 39876.0 ]</td>
+       <td>[ 0.0, 39766.0 ]</td>
+       <td>[ -0.7, 13.82 ]</td>
+       <td>[ -2.0, 255.39 ]</td>
+       <td>[ 0.0, 13.36 ]</td>
+       <td>[ 0.0, 16.77 ]</td>
+       <td>[ 0.0, 24.71 ]</td>
+       <td>[ 0.0, 685.0 ]</td>
+       <td>[ -328.0, 962.0 ]</td>
+       <td>[ 0.0, 871.0 ]</td>
+       <td>[ 0.0, 881.0 ]</td>
+       <td>[ 0.0, 935.0 ]</td>
+       <td>[ 0.0, 7.11 ]</td>
+       <td>[ 0.0, 7.11 ]</td>
+       <td>[ 0.0, 8.93 ]</td>
+       <td>[ 0.0, 648.0 ]</td>
+       <td>[ 0.0, 618.0 ]</td>
+       <td>[ 0.0, 11563842.15 ]</td>
+       <td>[ 0.0, 0.9 ]</td>
+       <td>[ 0.0, 21146257.12 ]</td>
+       <td>[ 0.0, 0.76 ]</td>
+       <td>[ 0.0, 14140225.9 ]</td>
+       <td>[ 0.0, 0.42 ]</td>
+       <td>[ 0.0, 15576076.83 ]</td>
+       <td>[ 0.0, 0.51 ]</td>
+     </tr>
+   </tbody>
+ </table>
+ </div>
+
 ### Benchmark
 The benchmarking defines the downstream task which is used for evaluating the goodness of the synthesized event log datasets with the metrics of real-world datasets. The command to execute a benchmarking is shown in the following script:
 
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/benchmark.json
+ python main.py -a config_files/pipeline_steps/benchmark.json
 ```
 
 In the `benchmark.json`, we have the following key-value pairs:
@@ -125,7 +361,7 @@ The purpose of the evaluation plotting step is used just for visualization. Some
 
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/pipeline_steps/evaluation_plotter.json
+ python main.py -a config_files/pipeline_steps/evaluation_plotter.json
 ```
 
 Generally, in the `evaluation_plotter.json`, we have the following key-value pairs:
@@ -141,26 +377,35 @@ In this repository, experiments can be run selectively or from scratch, as prefe
 We present two settings for generating intentional event logs, using [real targets](#generating-data-with-real-targets) or using [grid targets](#generating-data-with-grid-targets). Both settings output `.xes` event logs, `.json` and `.csv` files containing feature values, as well as evaluation results, from running a [process discovery benchmark](#benchmark), for the generated event logs.
 
 ### Generating data with real targets
- To execute the experiments with real targets, we employ the [experiment_real_targets.json](config_files/algorithm/experiment_real_targets.json). The script's pipeline will output the [generated event logs (GenBaselineED)](data/event_logs/GenBaselineED), which optimize their feature values towards [real-world event data features](data/BaselineED_feat.csv), alongside their respectively measured [feature values](data/GenBaselineED_feat.csv) and [benchmark metrics values](data/GenBaselineED_bench.csv).
+ To execute the experiments with real targets, we employ the [experiment_real_targets.json](config_files/experiment_real_targets.json). The script's pipeline will output the [generated event logs (GenBaselineED)](data/event_logs/GenBaselineED), which optimize their feature values towards [real-world event data features](data/BaselineED_feat.csv), alongside their respectively measured [feature values](data/GenBaselineED_feat.csv) and [benchmark metrics values](data/GenBaselineED_bench.csv).
 
 ```console
 conda activate gedi
- python main.py -o config_files/options/baseline.json -a config_files/algorithm/experiment_real_targets.json
+ python main.py -a config_files/experiment_real_targets.json
 ```
 
 ### Generating data with grid targets
- To execute the experiments with grid targets, a single [configuration](config_files/algorithm/grid_2obj) can be selected or all [grid objectives](data/grid_2obj) can be run with one command using the following script. This script will output the [generated event logs (GenED)](data/event_logs/GenED), alongside their respectively measured [feature values](data/GenED_feat.csv) and [benchmark metrics values](data/GenED_bench.csv).
+ To execute the experiments with grid targets, a single [configuration](config_files/grid_2obj) can be selected or all [grid objectives](data/grid_2obj) can be run with one command using the following script. This script will output the [generated event logs (GenED)](data/event_logs/GenED), alongside their respectively measured [feature values](data/GenED_feat.csv) and [benchmark metrics values](data/GenED_bench.csv).
 ```
 conda activate gedi
- python execute_grid_experiments.py config_files/algorithm/grid_2obj
+ python gedi/utils/execute_grid_experiments.py config_files/test
+ ```
+ We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/grid_2obj) and [objective](data/grid_2obj) files for this experiment.
+ For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
+ To create configuration files for grid objectives interactively, you can use the start the following dashboard:
+ ```
+ streamlit run utils/config_fabric.py # To tunnel to local machine add: --server.port 8501 --server.headless true
+
+ # In local machine (only in case you are tunneling):
+ ssh -N -f -L 9000:localhost:8501 <user@remote_machine.com>
+ open "http://localhost:9000/"
 ```
- We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/algorithm/grid_2obj) and [objective](data/grid_2obj) files for this experiment. For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
 
 ### Visualizations
 To run the visualizations, we employ [jupyter notebooks](https://jupyter.org/install) and [add the installed environment to the jupyter notebook](https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084). We then start all visualizations by running e.g.: `jupyter noteboook`. In the following, we describe the `.ipynb`-files in the folder `\notebooks` to reproduce the figures from our paper.
 
 #### [Fig. 4 and fig. 5 Representativeness](notebooks/gedi_figs4and5_representativeness.ipynb)
- To visualize the coverage of the feasible feature space of generated event logs compared to existing real-world benchmark datasets, in this notebook, we conduct a principal component analysis on the features of both settings. The first two principal components are utilized to visualize the coverage which is further highlighted by computing a convex hull of the 2D mapping. Additionally, we visualize the distribution of each meta feature we used in the paper as a boxplot. Additional features can be extracted with FEEED. Therefore, the notebook contains the figures 4 and 5 in the paper.
+ To visualize the coverage of the feasible feature space of generated event logs compared to existing real-world benchmark datasets, in this notebook, we conduct a principal component analysis on the features of both settings. The first two principal components are utilized to visualize the coverage which is further highlighted by computing a convex hull of the 2D mapping.Additionally, we visualize the distribution of each meta feature we used in the paper as a boxplot. Additional features can be extracted with FEEED. Therefore, the notebook contains the figures 4 and 5 in the paper.
 
 #### [Fig. 6 Benchmark Boxplots](notebooks/gedi_fig6_benchmark_boxplots.ipynb)
 This notebook is used to visualize the metric distribution of real event logs compared to the generated ones. It shows 5 different metrics on 3 various process discovery techniques. We use 'fitness,', 'precision', 'fscore', 'size', 'cfc' (control-flow complexity) as metrics and as 'heuristic miner', 'ilp' (integer linear programming), and 'imf' (inductive miner infrequent) as miners. The notebook outputs the visualization shown in Fig.6 in the paper.
@@ -169,11 +414,14 @@ This notebook is used to visualize the metric distribution of real event logs co
 
 This notebook is used to answer the question if there is a statistically significant relation between feature similarity and performance metrics for the downstream tasks of process discovery. For that, we compute the pearson coefficient, as well as the kendall's tau coefficient. This elucidates the correlation between the features with metric scores being used for process discovery. Each coefficient is calculated for three different settings: i) real-world datasets; ii) synthesized event log data with real-world targets; iii) synthesized event log data with grid objectives. Figures 7 and 8 shown in the paper refer to this notebook.
 
+ #### [Fig. 9 Consistency and fig. 10 Limitations](notebooks/gedi_figs9and10_consistency.ipynb)
+ Likewise to the evaluation on the statistical tests in notebook `gedi_figs7and8_benchmarking_statisticalTests.ipynb`, this notebook is used to compute the differences between two correlation matrices $\Delta C = C_1 - C_2$. This logic is employed to evaluate and visualize the distance of two correlation matrices. Furthermore, we show how significant scores are retained from the correlations being evaluated on real-world datasets coompared to synthesized event log datasets with real-world targets. In Fig. 9 and 10 in the paper, the results of the notebook are shown.
+
 ## Citation
- The `GEDI` framework is taken directly from the original paper by [Maldonado](mailto:[email protected]), Frey, Tavares, Rehwald and Seidl and is *to appear on BPM'24*.
+ The `GEDI` framework is taken directly from the original paper by [Maldonado](mailto:[email protected]), Frey, Tavares, Rehwald and Seidl on BPM'24.
 
- ```bibtex
- @InProceedings{10.1007/978-3-031-70396-6_13,
+ ```
+ @InProceedings{maldonado2024gedi,
 author="Maldonado, Andrea
 and Frey, Christian M. M.
 and Tavares, Gabriel Marques
@@ -193,3 +441,20 @@ abstract="Process mining solutions include enhancing performance, conserving res
 isbn="978-3-031-70396-6"
 }
 ```
+
+ Furthermore, the `iGEDI` web application is taken directly from the original paper by [Maldonado](mailto:[email protected]), Aryasomayajula, Frey, and Seidl and is *to appear on Demos@ICPM'24*.
+ ```
+ @inproceedings{maldonado2024igedi,
+   author = {Andrea Maldonado and
+             Sai Anirudh Aryasomayajula and
+             Christian M. M. Frey and
+             Thomas Seidl},
+   editor = {Jochen De Weerdt, Giovanni Meroni, Han van der Aa, and Karolin Winter},
+   title = {iGEDI: interactive Generating Event Data with Intentional Features},
+   booktitle = {ICPM 2024 Tool Demonstration Track, October 14-18, 2024, Kongens Lyngby, Denmark},
+   series = {{CEUR} Workshop Proceedings},
+   publisher = {CEUR-WS.org},
+   year = {2024},
+   bibsource = {dblp computer science bibliography, https://dblp.org}
+ }
+ ```
config.py CHANGED
@@ -1,10 +1,8 @@
 import json
- import os
 import warnings
 
- from gedi.utils.io_helpers import sort_files
- from tqdm import tqdm
- from utils.param_keys import INPUT_NAME, FILENAME, FOLDER_PATH, PARAMS
+ from utils.param_keys import PIPELINE_STEP, INPUT_PATH, OUTPUT_PATH
+ from utils.param_keys.features import FEATURE_SET, FEATURE_PARAMS
 
 def get_model_params_list(alg_json_file: str) :#-> list[dict]:
     """
@@ -20,69 +18,8 @@ def get_model_params_list(alg_json_file: str) :#-> list[dict]:
     warnings.warn('The default model parameter list is used instead of a .json-file.\n'
                   ' Use a configuration from the `config_files`-folder together with the args `-a`.')
     return [
-     {ALGORITHM_NAME: 'pca', NDIM: TENSOR_NDIM},
+     {PIPELINE_STEP: 'feature_extraction', INPUT_PATH: 'data/test',
+      FEATURE_PARAMS: {FEATURE_SET: ['ratio_unique_traces_per_trace',
+                                     'ratio_most_common_variant']},
+      OUTPUT_PATH: 'output/plots'}
     ]
- def get_run_params(alg_params_json: str) -> dict:
-     """
-     Loads the running configuration given from a json file or the default dictionary from the code.
-     @param alg_params_json: str
-         Path to the json data with the running configuration
-     @return: dict
-         Running Configuration
-     """
-     if alg_params_json is not None:
-         return json.load(open(alg_params_json))
-     else:
-         warnings.warn('The default run option is used instead of a .json-file.\n'
-                       ' Use a configuration from the `config_files`-folder together with the args `-o`.')
-         return {
-             RUN_OPTION: COMPARE,
-             PLOT_TYPE: COLOR_MAP, # 'heat_map', 'color_map', '3d_map', 'explained_var_plot'
-             PLOT_TICS: True,
-             N_COMPONENTS: 2,
-             INPUT_NAME: 'runningExample',
-             SAVE_RESULTS: True,
-             LOAD_RESULTS: True
-         }
-
- def get_files_and_kwargs(params: dict):
-     """
-     This method returns the filename list of the trajectory and generates the kwargs for the DataTrajectory.
-     The method is individually created for the available data set.
-     Add new trajectory options, if different data set are used.
-     @param params: dict
-         running configuration
-     @return: tuple
-         list of filenames of the trajectories AND
-         kwargs with the important arguments for the classes
-     """
-     try:
-         input_name = params[INPUT_NAME]
-     except KeyError as e:
-         raise KeyError(f'Run option parameter is missing the key: `{e}`. This parameter is mandatory.')
-
-     #TODO: generate parent directories if they don't exist
-     if input_name == 'test':
-         filename_list = list(tqdm(sort_files(os.listdir('data/test'))))
-         kwargs = {FILENAME: filename_list, FOLDER_PATH: 'data/test'}
-     elif input_name == 'realLogs':
-         filename_list = list(tqdm(sort_files(os.listdir('data/real_event_logs'))))
-         kwargs = {FILENAME: filename_list, FOLDER_PATH: 'data/real_event_logs'}
-     elif input_name == 'gen5':
-         filename_list = list(tqdm(sort_files(os.listdir('data/event_log'))))[:5]
-         kwargs = {FILENAME: filename_list, FOLDER_PATH: 'data/event_log'}
-     elif input_name == 'gen20':
-         filename_list = list(tqdm(sort_files(os.listdir('data/event_log'))))[:20]
-         kwargs = {FILENAME: filename_list, FOLDER_PATH: 'data/event_log'}
-     elif input_name == 'runningExample':
-         filename_list = ['running-example.xes']
-         kwargs = {FILENAME: filename_list[0], FOLDER_PATH: 'data/'}
-     elif input_name == 'metaFeatures':
-         filename_list = ['log_features.csv']
-         kwargs = {FILENAME: filename_list[0], FOLDER_PATH: 'results/'}
-     else:
-         raise ValueError(f'No data trajectory was found with the name `{input_name}`.')
-
-     #filename_list.pop(file_element)
-     kwargs[PARAMS] = params
-     return filename_list, kwargs
config_files/config_layout.json ADDED
@@ -0,0 +1,48 @@
+ [
+   {
+     "pipeline_step": "instance_augmentation",
+     "augmentation_params":{"method":"SMOTE", "no_samples":2,
+       "feature_selection": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
+     "input_path": "data/test/bpic_features.csv",
+     "output_path": "output"
+   },
+   {
+     "pipeline_step": "event_logs_generation",
+     "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
+     "output_path": "data/frontend/test",
+     "generator_params": {
+       "experiment": "data/grid_objectives.csv",
+       "experiment": {"input_path": "data/2_bpic_features.csv",
+         "objectives": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
+ "experiment": {"n_traces":832, "n_unique_traces":828, "ratio_variants_per_number_of_traces":0.99, "trace_len_min":1, "trace_len_max":132, "trace_len_mean":53.31, "trace_len_median":54, "trace_len_mode":61, "trace_len_std":19.89, "trace_len_variance":395.81, "trace_len_q1":44, "trace_len_q3":62, "trace_len_iqr":18, "trace_len_geometric_mean":48.15, "trace_len_geometric_std":1.69, "trace_len_harmonic_mean":37.58, "trace_len_skewness":0.0541, "trace_len_kurtosis":0.81, "trace_len_coefficient_variation":0.37, "trace_len_entropy":6.65, "trace_len_hist1":0.004, "trace_len_hist2":0.005, "trace_len_hist3":0.005, "trace_len_hist4":0.024, "trace_len_hist5":0.024, "trace_len_hist6":0.008, "trace_len_hist7":0.005, "trace_len_hist8":0.001, "trace_len_hist9":0.0, "trace_len_hist10":0.00, "trace_len_skewness_hist":0.05, "trace_len_kurtosis_hist":0.8, "ratio_most_common_variant":0.0, "ratio_top_1_variants":0.01, "ratio_top_5_variants":0.05, "ratio_top_10_variants":0.10, "ratio_top_20_variants":0.2, "ratio_top_50_variants":0.5, "ratio_top_75_variants":0.75, "mean_variant_occurrence":1.0, "std_variant_occurrence":0.07, "skewness_variant_occurrence":14.28, "kurtosis_variant_occurrence":202.00, "n_unique_activities":410, "activities_min":1, "activities_max":830, "activities_mean":108.18, "activities_median":12, "activities_std":187.59, "activities_variance":35189, "activities_q1":3, "activities_q3":125, "activities_iqr":122, "activities_skewness":2.13, "activities_kurtosis":3.81, "n_unique_start_activities":14, "start_activities_min":1, "start_activities_max":731, "start_activities_mean":59.43, "start_activities_median":1, "start_activities_std":186.72, "start_activities_variance":34863, "start_activities_q1":1, "start_activities_q3":8, "start_activities_iqr":7, "start_activities_skewness":3, "start_activities_kurtosis":9.0, "n_unique_end_activities":82, "end_activities_min":1, "end_activities_max":216, "end_activities_mean":10, "end_activities_median":1, "end_activities_std":35, "end_activities_variance":1247, "end_activities_q1":1, "end_activities_q3":3, "end_activities_iqr":2, "end_activities_skewness":5, "end_activities_kurtosis":26, "eventropy_trace":10, "eventropy_prefix":15, "eventropy_global_block":19, "eventropy_lempel_ziv":4, "eventropy_k_block_diff_1":7.1, "eventropy_k_block_diff_3":7.1, "eventropy_k_block_diff_5":7.1, "eventropy_k_block_ratio_1":7.1, "eventropy_k_block_ratio_3":7.1, "eventropy_k_block_ratio_5":7.1, "eventropy_knn_3":5.54, "eventropy_knn_5":5.04, "eventropy_knn_7":4.72, "epa_variant_entropy":240512, "epa_normalized_variant_entropy":0.68, "epa_sequence_entropy":285876, "epa_normalized_sequence_entropy":0.60, "epa_sequence_entropy_linear_forgetting":150546, "epa_normalized_sequence_entropy_linear_forgetting":0.32, "epa_sequence_entropy_exponential_forgetting":185312, "epa_normalized_sequence_entropy_exponential_forgetting":0.39},
+       "config_space": {
+         "mode": [5, 20],
+         "sequence": [0.01, 1],
+         "choice": [0.01, 1],
+         "parallel": [0.01, 1],
+         "loop": [0.01, 1],
+         "silent": [0.01, 1],
+         "lt_dependency": [0.01, 1],
+         "num_traces": [10, 100],
+         "duplicate": [0],
+         "or": [0]
+       },
+       "n_trials": 50
+     }
+   },
+   {
+     "pipeline_step": "feature_extraction",
+     "input_path": "data/test",
+ "feature_params": {"feature_set": ["n_traces", "n_unique_traces", "ratio_unique_traces_per_trace", "trace_len_min", "trace_len_max", "trace_len_mean", "trace_len_median", "trace_len_mode", "trace_len_std", "trace_len_variance", "trace_len_q1", "trace_len_q3", "trace_len_iqr", "trace_len_geometric_mean", "trace_len_geometric_std", "trace_len_harmonic_mean", "trace_len_skewness", "trace_len_kurtosis", "trace_len_coefficient_variation", "trace_len_entropy", "trace_len_hist1", "trace_len_hist2", "trace_len_hist3", "trace_len_hist4", "trace_len_hist5", "trace_len_hist6", "trace_len_hist7", "trace_len_hist8", "trace_len_hist9", "trace_len_hist10", "trace_len_skewness_hist", "trace_len_kurtosis_hist", "ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants", "ratio_top_10_variants", "ratio_top_20_variants", "ratio_top_50_variants", "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence", "skewness_variant_occurrence", "kurtosis_variant_occurrence", "n_unique_activities", "activities_min", "activities_max", "activities_mean", "activities_median", "activities_std", "activities_variance", "activities_q1", "activities_q3", "activities_iqr", "activities_skewness", "activities_kurtosis", "n_unique_start_activities", "start_activities_min", "start_activities_max", "start_activities_mean", "start_activities_median", "start_activities_std", "start_activities_variance", "start_activities_q1", "start_activities_q3", "start_activities_iqr", "start_activities_skewness", "start_activities_kurtosis", "n_unique_end_activities", "end_activities_min", "end_activities_max", "end_activities_mean", "end_activities_median", "end_activities_std", "end_activities_variance", "end_activities_q1", "end_activities_q3", "end_activities_iqr", "end_activities_skewness", "end_activities_kurtosis", "eventropy_trace", "eventropy_prefix", "eventropy_prefix_flattened", "eventropy_global_block", "eventropy_global_block_flattened", "eventropy_lempel_ziv", "eventropy_lempel_ziv_flattened", "eventropy_k_block_diff_1", "eventropy_k_block_diff_3", "eventropy_k_block_diff_5", "eventropy_k_block_ratio_1", "eventropy_k_block_ratio_3", "eventropy_k_block_ratio_5", "eventropy_knn_3", "eventropy_knn_5", "eventropy_knn_7", "epa_variant_entropy", "epa_normalized_variant_entropy", "epa_sequence_entropy", "epa_normalized_sequence_entropy", "epa_sequence_entropy_linear_forgetting", "epa_normalized_sequence_entropy_linear_forgetting", "epa_sequence_entropy_exponential_forgetting", "epa_normalized_sequence_entropy_exponential_forgetting"]},
+     "output_path": "output/plots",
+     "real_eventlog_path": "data/BaselineED_feat.csv",
+     "plot_type": "boxplot"
+   },
+   {
+     "pipeline_step": "benchmark_test",
+     "benchmark_task": "discovery",
+     "input_path":"data/test",
+     "output_path":"output",
+     "miners" : ["inductive", "heu", "imf", "ilp"]
+   }
+ ]
config_files/{algorithm/experiment_real_targets.json → experiment_real_targets.json} RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enseef.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enself.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_enve.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rmcv.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rt10v.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_ense_rvpnot.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_enself.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_enve.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rmcv.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rt10v.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enseef_rvpnot.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_enve.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rmcv.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rt10v.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enself_rvpnot.json RENAMED
File without changes
config_files/grid_2obj/generator_grid_2objectives_enve_mvo.json ADDED
@@ -0,0 +1 @@
+ [{"pipeline_step": "event_logs_generation", "output_path": "output/shaining/grid_2obj", "generator_params": {"experiment": {"input_path": "data/grid_2obj/grid_2objectives_enve_mvo.csv", "objectives": ["epa_normalized_variant_entropy", "mean_variant_occurrence"]}, "config_space": {"mode": [5, 20], "sequence": [0.01, 1], "choice": [0.01, 1], "parallel": [0.01, 1], "loop": [0.01, 1], "silent": [0.01, 1], "lt_dependency": [0.01, 1], "num_traces": [10, 10001], "duplicate": [0], "or": [0]}, "n_trials": 200}}, {"pipeline_step": "feature_extraction", "input_path": "output/features/shaining/grid_2obj/grid_2objectives_enve_mvo/2_enve_mvo", "feature_params": {"feature_set": ["ratio_variants_per_number_of_traces", "ratio_most_common_variant", "ratio_top_10_variants", "epa_normalized_variant_entropy", "epa_normalized_sequence_entropy", "epa_normalized_sequence_entropy_linear_forgetting", "epa_normalized_sequence_entropy_exponential_forgetting"]}, "output_path": "output/plots", "real_eventlog_path": "data/BaselineED_feat.csv", "plot_type": "boxplot"}, {"pipeline_step": "benchmark_test", "benchmark_test": "discovery", "input_path": "output/shaining/grid_2obj/grid_2objectives_enve_mvo/2_enve_mvo", "output_path": "output", "miners": ["heu", "imf", "ilp"]}]
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rmcv.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rt10v.json RENAMED
File without changes
config_files/{algorithm/grid_2obj → grid_2obj}/generator_grid_2objectives_enve_rvpnot.json RENAMED
File without changes
config_files/grid_2obj/generator_grid_2objectives_enve_sam.json ADDED
@@ -0,0 +1 @@
+ [{"pipeline_step": "event_logs_generation", "output_path": "output/shaining/grid_2obj", "generator_params": {"experiment": {"input_path": "data/grid_2obj/grid_2objectives_enve_sam.csv", "objectives": ["epa_normalized_variant_entropy", "start_activities_median"]}, "config_space": {"mode": [5, 20], "sequence": [0.01, 1], "choice": [0.01, 1], "parallel": [0.01, 1], "loop": [0.01, 1], "silent": [0.01, 1], "lt_dependency": [0.01, 1], "num_traces": [10, 10001], "duplicate": [0], "or": [0]}, "n_trials": 200}}, {"pipeline_step": "feature_extraction", "input_path": "output/features/shaining/grid_2obj/grid_2objectives_enve_sam/2_enve_sam", "feature_params": {"feature_set": ["ratio_variants_per_number_of_traces", "ratio_most_common_variant", "ratio_top_10_variants", "epa_normalized_variant_entropy", "epa_normalized_sequence_entropy", "epa_normalized_sequence_entropy_linear_forgetting", "epa_normalized_sequence_entropy_exponential_forgetting"]}, "output_path": "output/plots", "real_eventlog_path": "data/BaselineED_feat.csv", "plot_type": "boxplot"}, {"pipeline_step": "benchmark_test", "benchmark_test": "discovery", "input_path": "output/shaining/grid_2obj/grid_2objectives_enve_sam/2_enve_sam", "output_path": "output", "miners": ["heu", "imf", "ilp"]}]
config_files/grid_2obj/generator_grid_2objectives_mvo_sam.json ADDED
@@ -0,0 +1 @@
+ [{"pipeline_step": "event_logs_generation", "output_path": "output/shaining/grid_2obj", "generator_params": {"experiment": {"input_path": "data/grid_2obj/grid_2objectives_mvo_sam.csv", "objectives": ["mean_variant_occurrence", "start_activities_median"]}, "config_space": {"mode": [5, 20], "sequence": [0.01, 1], "choice": [0.01, 1], "parallel": [0.01, 1], "loop": [0.01, 1], "silent": [0.01, 1], "lt_dependency": [0.01, 1], "num_traces": [10, 10001], "duplicate": [0], "or": [0]}, "n_trials": 200}}, {"pipeline_step": "feature_extraction", "input_path": "output/features/shaining/grid_2obj/grid_2objectives_mvo_sam/2_mvo_sam", "feature_params": {"feature_set": ["ratio_variants_per_number_of_traces", "ratio_most_common_variant", "ratio_top_10_variants", "epa_normalized_variant_entropy", "epa_normalized_sequence_entropy", "epa_normalized_sequence_entropy_linear_forgetting", "epa_normalized_sequence_entropy_exponential_forgetting"]}, "output_path": "output/plots", "real_eventlog_path": "data/BaselineED_feat.csv", "plot_type": "boxplot"}, {"pipeline_step": "benchmark_test", "benchmark_test": "discovery", "input_path": "output/shaining/grid_2obj/grid_2objectives_mvo_sam/2_mvo_sam", "output_path": "output", "miners": ["heu", "imf", "ilp"]}]
config_files/{algorithm/grid_2obj β†’ grid_2obj}/generator_grid_2objectives_rmcv_rt10v.json RENAMED
File without changes
config_files/{algorithm/grid_2obj β†’ grid_2obj}/generator_grid_2objectives_rmcv_rvpnot.json RENAMED
File without changes
config_files/{algorithm/grid_2obj β†’ grid_2obj}/generator_grid_2objectives_rt10v_rvpnot.json RENAMED
File without changes
config_files/options/baseline.json DELETED
@@ -1,9 +0,0 @@
- {
- "run_option": "baseline",
- "plot_type": "color_map",
- "plot_tics": true,
- "n_components": 2,
- "input_name": "test",
- "save_results": false,
- "load_results": false
- }
config_files/options/run_params.json DELETED
@@ -1,9 +0,0 @@
- {
- "run_option": "compare",
- "plot_type": "color_map",
- "plot_tics": true,
- "n_components": 2,
- "input_name": "gen20",
- "save_results": false,
- "load_results": true
- }
config_files/{algorithm/pipeline_steps β†’ pipeline_steps}/augmentation.json RENAMED
File without changes
config_files/{algorithm/pipeline_steps β†’ pipeline_steps}/benchmark.json RENAMED
@@ -4,6 +4,6 @@
  "benchmark_test": "discovery",
  "input_path":"data/test",
  "output_path":"output",
- "miners" : ["inductive", "heu", "imf", "ilp"]
+ "miners" : ["ind", "heu", "imf", "ilp"]
  }
  ]
config_files/{algorithm/pipeline_steps β†’ pipeline_steps}/evaluation_plotter.json RENAMED
File without changes
config_files/{algorithm/pipeline_steps β†’ pipeline_steps}/feature_extraction.json RENAMED
File without changes
config_files/{algorithm/pipeline_steps β†’ pipeline_steps}/generation.json RENAMED
File without changes
config_files/{algorithm β†’ test}/experiment_test.json RENAMED
File without changes
config_files/{algorithm/test β†’ test}/generator_2bpic_2objectives_ense_enseef.json RENAMED
File without changes
config_files/{algorithm/test β†’ test}/generator_grid_1objectives_rt10v.json RENAMED
File without changes
config_files/{algorithm/test β†’ test}/generator_grid_2objectives_ense_enself.json RENAMED
File without changes
config_files/test/test_abbrv_generation.json ADDED
@@ -0,0 +1,16 @@
+ [{"pipeline_step": "event_logs_generation",
+ "output_path": "output/test",
+ "generator_params": {"experiment":
+ {"input_path": "data/test/igedi_table_1.csv",
+ "objectives": ["rmcv","ense"]},
+ "config_space": {"mode": [5, 20], "sequence": [0.01, 1],
+ "choice": [0.01, 1], "parallel": [0.01, 1], "loop": [0.01, 1],
+ "silent": [0.01, 1], "lt_dependency": [0.01, 1],
+ "num_traces": [10, 10001], "duplicate": [0],
+ "or": [0]}, "n_trials": 2}},
+ {"pipeline_step": "feature_extraction",
+ "input_path": "output/test/igedi_table_1/2_ense_rmcv",
+ "feature_params": {"feature_set": ["simple_stats", "trace_length", "trace_variant",
+ "activities", "start_activities", "end_activities", "eventropies", "epa_based"]},
+ "output_path": "output/plots", "real_eventlog_path": "data/test/2_bpic_features.csv",
+ "plot_type": "boxplot"}]
data/test/grid_experiments/rt10v.csv DELETED
@@ -1,12 +0,0 @@
- task,ratio_top_10_variants
- task_1,0.0
- task_2,0.1
- task_3,0.2
- task_4,0.3
- task_5,0.4
- task_6,0.5
- task_7,0.6
- task_8,0.7
- task_9,0.8
- task_10,0.9
- task_11,1.0
data/test/grid_feat.csv CHANGED
@@ -1,3 +1,5 @@
  log,ratio_top_20_variants,epa_normalized_sequence_entropy_linear_forgetting
  experiment1,0.2,0.4
  experiment2,0.4,0.7
+ experiment3,NaN,0.4
+ experiment4,0.2,NaN
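A minimal sketch, assuming pandas is available, of how the two new rows can be read; the NaN cells appear to mark objectives that are intentionally left unspecified for an experiment (compare the genELexperiment3/4 validation files further below).

# Sketch only: list the defined objective targets per experiment row.
import pandas as pd

feat = pd.read_csv("data/test/grid_feat.csv")
for _, row in feat.iterrows():
    # Keep only the objective columns that are actually defined for this row.
    defined = row.drop(labels="log").dropna()
    print(row["log"], defined.to_dict())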
data/test/igedi_table_1.csv ADDED
@@ -0,0 +1,4 @@
+ log,rmcv,ense
+ BPIC15f4,0.003,0.604
+ RTFMP,0.376,0.112
+ HD,0.517,0.254
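A minimal sketch, assuming pandas, of how the small target table added above can be consumed; the column names are exactly those in the CSV.

# Sketch only: one row per real-world log, with the abbreviated objective targets as columns.
import pandas as pd

targets = pd.read_csv("data/test/igedi_table_1.csv")
for _, row in targets.iterrows():
    print(f'{row["log"]}: rmcv={row["rmcv"]}, ense={row["ense"]}')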
data/validation/2_ense_rmcv_feat.csv ADDED
@@ -0,0 +1,4 @@
+ log,n_traces,n_unique_traces,trace_len_coefficient_variation,trace_len_entropy,trace_len_geometric_mean,trace_len_geometric_std,trace_len_harmonic_mean,trace_len_hist1,trace_len_hist10,trace_len_hist2,trace_len_hist3,trace_len_hist4,trace_len_hist5,trace_len_hist6,trace_len_hist7,trace_len_hist8,trace_len_hist9,trace_len_iqr,trace_len_kurtosis,trace_len_kurtosis_hist,trace_len_max,trace_len_mean,trace_len_median,trace_len_min,trace_len_mode,trace_len_q1,trace_len_q3,trace_len_skewness,trace_len_skewness_hist,trace_len_std,trace_len_variance,kurtosis_variant_occurrence,mean_variant_occurrence,ratio_most_common_variant,ratio_top_10_variants,ratio_top_1_variants,ratio_top_20_variants,ratio_top_50_variants,ratio_top_5_variants,ratio_top_75_variants,skewness_variant_occurrence,std_variant_occurrence,activities_iqr,activities_kurtosis,activities_max,activities_mean,activities_median,activities_min,activities_q1,activities_q3,activities_skewness,activities_std,activities_variance,n_unique_activities,n_unique_start_activities,start_activities_iqr,start_activities_kurtosis,start_activities_max,start_activities_mean,start_activities_median,start_activities_min,start_activities_q1,start_activities_q3,start_activities_skewness,start_activities_std,start_activities_variance,end_activities_iqr,end_activities_kurtosis,end_activities_max,end_activities_mean,end_activities_median,end_activities_min,end_activities_q1,end_activities_q3,end_activities_skewness,end_activities_std,end_activities_variance,n_unique_end_activities,eventropy_global_block,eventropy_global_block_flattened,eventropy_k_block_diff_1,eventropy_k_block_diff_3,eventropy_k_block_diff_5,eventropy_k_block_ratio_1,eventropy_k_block_ratio_3,eventropy_k_block_ratio_5,eventropy_knn_3,eventropy_knn_5,eventropy_knn_7,eventropy_lempel_ziv,eventropy_lempel_ziv_flattened,eventropy_prefix,eventropy_prefix_flattened,eventropy_trace,epa_variant_entropy,epa_normalized_variant_entropy,epa_sequence_entropy,epa_normalized_sequence_entropy,epa_sequence_entropy_linear_forgetting,epa_normalized_sequence_entropy_linear_forgetting,epa_sequence_entropy_exponential_forgetting,epa_normalized_sequence_entropy_exponential_forgetting,ratio_variants_per_number_of_traces
+ genELBPIC15f4_0604_0003,8616,4031,1.0086445672512825,8.700230419287818,8.516920996327995,2.1832133718212567,6.58111248846037,0.05713165933282198,1.682074468800883e-05,0.009932649738269211,0.0033136867035377378,0.0012279143622246447,0.0005214430853282738,0.00017661781922409254,0.0001093348404720574,3.364148937601766e-05,0.0,9.0,11.77613857723645,4.64306597180025,141,11.964136490250697,7.0,3,3,5.0,14.0,2.836323931248485,2.5294876299887217,12.067561272744191,145.6260350714354,1651.5545366193303,2.137434879682461,0.09099350046425256,0.5789229340761374,0.40401578458681525,0.6256963788300836,0.766016713091922,0.5258820798514392,0.883008356545961,36.276105773051086,15.574023282690577,2184.5,1.9085746306932307,34121,12885.375,8627.0,8584,8616.0,10800.5,1.8663249384138656,8507.416043333898,72376127.734375,8,2,2111.0,-2.0,6419,4308.0,4308.0,2197,3252.5,5363.5,0.0,2111.0,4456321.0,768.0,0.0021026107788850723,4895,1723.2,832.0,495,813.0,1581.0,1.331337855426617,1625.5283940922102,2642342.56,5,15.897,16.276,2.756,1.525,1.375,2.756,2.016,1.775,6.564,6.07,5.761,1.405,1.786,12.139,13.493,9.703,365917.06171394786,0.7166786736830569,651595.1462643282,0.5475971681938718,62016.045914910814,0.05211796208164211,266396.7627350506,0.22387845232743814,0.46785051067780875
+ genELHD_0254_0517,6822,565,1.1300022933733087,8.390788875278787,1.9006921917027269,2.263915758458681,1.4763543408149593,0.28822871537617945,0.00010858116985352402,0.04077222927999826,0.02383356678284851,0.006080545511797346,0.005591930247456488,0.002823110416191621,0.0017915893025831464,0.0006514870191211442,0.0004886152643408582,2.0,9.718268017319556,4.770965470001153,28,2.8346525945470535,1.0,1,1,1.0,3.0,2.765986310146101,2.5637920433464965,3.2031639327547703,10.260259180101007,226.4931382842208,12.07433628318584,0.24860744649662855,0.9079448841981823,0.6807387862796834,0.9321313397830548,0.9585165640574611,0.8717384931105248,0.9791849897390794,14.639488482439702,105.6342402074512,1283.0,8.118508585327676,6848,1137.5294117647059,472.0,208,413.0,1696.0,2.9234849385484285,1541.823981624173,2377221.1903114184,17,10,294.25,2.299363631971671,3383,682.2,217.0,101,121.75,416.0,1.9301655015244086,1008.2924972447232,1016653.7600000001,334.5,2.8813625853874614,3383,620.1818181818181,157.0,79,104.5,439.0,2.0614116860983223,981.5564465945092,963453.0578512397,11,9.069,10.932,3.265,0.908,0.67,3.265,1.808,1.456,4.81,4.359,4.05,0.696,2.01,6.995,10.12,4.469,16958.33766640406,0.7450438396474315,70379.87102533762,0.36874603139171797,9719.481922433943,0.050923940806750986,30545.050254490514,0.16003675334882345,0.08282028730577544
+ genELRTFMP_0112_0376,6822,565,1.1300022933733087,8.390788875278787,1.9006921917027269,2.263915758458681,1.4763543408149593,0.28822871537617945,0.00010858116985352402,0.04077222927999826,0.02383356678284851,0.006080545511797346,0.005591930247456488,0.002823110416191621,0.0017915893025831464,0.0006514870191211442,0.0004886152643408582,2.0,9.718268017319556,4.770965470001153,28,2.8346525945470535,1.0,1,1,1.0,3.0,2.765986310146101,2.5637920433464965,3.2031639327547703,10.260259180101007,226.4931382842208,12.07433628318584,0.24860744649662855,0.9079448841981823,0.6807387862796834,0.9321313397830548,0.9585165640574611,0.8717384931105248,0.9791849897390794,14.639488482439702,105.6342402074512,1283.0,8.118508585327676,6848,1137.5294117647059,472.0,208,413.0,1696.0,2.9234849385484285,1541.823981624173,2377221.1903114184,17,10,294.25,2.299363631971671,3383,682.2,217.0,101,121.75,416.0,1.9301655015244086,1008.2924972447232,1016653.7600000001,334.5,2.8813625853874614,3383,620.1818181818181,157.0,79,104.5,439.0,2.0614116860983223,981.5564465945092,963453.0578512397,11,9.069,10.932,3.265,0.908,0.67,3.265,1.808,1.456,4.81,4.359,4.05,0.696,2.01,6.995,10.12,4.469,16958.33766640406,0.7450438396474315,70379.87102533762,0.36874603139171797,9719.481922433943,0.050923940806750986,30545.050254490514,0.16003675334882345,0.08282028730577544
data/validation/genELexperiment1_04_02.json CHANGED
@@ -1 +1 @@
- {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "experiment1"}
+ {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.7418932364693804}
data/validation/genELexperiment3_04_nan.json ADDED
@@ -0,0 +1 @@
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.7418932612931086}
data/validation/genELexperiment4_nan_02.json ADDED
@@ -0,0 +1 @@
+ {"ratio_top_20_variants": 0.2, "log": "genELexperiment4_nan_02", "target_similarity": 1.0}