Andrea Maldonado commited on
Commit
277c251
·
1 Parent(s): 1296e6b

WIP Adds config file fabric

Browse files
README.md CHANGED
@@ -63,7 +63,7 @@ The JSON file consists of the following key-value pairs:
63
  - real_eventlog_path: defines the file with the features extracted from the real event logs
64
  - plot_type: defines the style of the output plotting (possible values: violinplot, boxplot)
65
  - font_size: label font size of the output plot
66
- - boxplot_widht: width of the violinplot/boxplot
67
 
68
 
69
  ### Generation
@@ -153,8 +153,16 @@ To execute the experiments with grid targets, a single [configuration](config_fi
153
  conda activate gedi
154
  python execute_grid_experiments.py config_files/algorithm/grid_2obj
155
  ```
156
- We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/algorithm/grid_2obj) and [objective](data/grid_2obj) files for this experiment. For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
 
 
 
 
157
 
 
 
 
 
158
  ### Visualizations
159
  To run the visualizations, we employ [jupyter notebooks](https://jupyter.org/install) and [add the installed environment to the jupyter notebook](https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084). We then start all visualizations by running e.g.: `jupyter notebook`. In the following, we describe the `.ipynb`-files in the folder `/notebooks` to reproduce the figures from our paper.
160
 
 
63
  - real_eventlog_path: defines the file with the features extracted from the real event logs
64
  - plot_type: defines the style of the output plotting (possible values: violinplot, boxplot)
65
  - font_size: label font size of the output plot
66
+ - boxplot_width: width of the violinplot/boxplot
67
 
68
 
69
  ### Generation
 
153
  conda activate gedi
154
  python execute_grid_experiments.py config_files/algorithm/grid_2obj
155
  ```
156
+ We employ the [experiment_grid_2obj_configfiles_fabric.ipynb](notebooks/experiment_grid_2obj_configfiles_fabric.ipynb) to create all necessary [configuration](config_files/algorithm/grid_2obj) and [objective](data/grid_2obj) files for this experiment.
157
+ For more details about these config_files, please refer to [Feature Extraction](#feature-extraction), [Generation](#generation), and [Benchmark](#benchmark).
158
+ To create configuration files for grid objectives interactively, you can start the following dashboard:
159
+ ```
160
+ streamlit run utils/config_fabric.py # To tunnel to local machine add: --server.port 8501 --server.headless true
161
 
162
+ # In local machine (only in case you are tunneling):
163
+ ssh -N -f -L 9000:localhost:8501 <user@remote_machine.com>
164
+ open "http://localhost:9000/"
165
+ ```
166
  ### Visualizations
167
  To run the visualizations, we employ [jupyter notebooks](https://jupyter.org/install) and [add the installed environment to the jupyter notebook](https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084). We then start all visualizations by running e.g.: `jupyter notebook`. In the following, we describe the `.ipynb`-files in the folder `/notebooks` to reproduce the figures from our paper.
168
 
config_files/config_layout.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "pipeline_step": "instance_augmentation",
4
+ "augmentation_params":{"method":"SMOTE", "no_samples":2,
5
+ "feature_selection": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
6
+ "input_path": "data/test/bpic_features.csv",
7
+ "output_path": "output"
8
+ },
9
+ {
10
+ "pipeline_step": "event_logs_generation",
11
+ "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
12
+ "output_path": "data/test",
13
+ "generator_params": {
14
+ "experiment": "data/grid_objectives.csv",
15
+ "experiment": {"input_path": "data/2_bpic_features.csv",
16
+ "objectives": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
17
+ "experiment": [
18
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.2, "ratio_top_20_variants": 0.4},
19
+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.4, "ratio_top_20_variants": 0.7}
20
+ ],
21
+ "experiment": {"epa_normalized_sequence_entropy_linear_forgetting": 0.2, "ratio_top_20_variants": 0.4},
22
+ "config_space": {
23
+ "mode": [5, 20],
24
+ "sequence": [0.01, 1],
25
+ "choice": [0.01, 1],
26
+ "parallel": [0.01, 1],
27
+ "loop": [0.01, 1],
28
+ "silent": [0.01, 1],
29
+ "lt_dependency": [0.01, 1],
30
+ "num_traces": [10, 100],
31
+ "duplicate": [0],
32
+ "or": [0]
33
+ },
34
+ "n_trials": 2
35
+ }
36
+ },
37
+ {
38
+ "pipeline_step": "feature_extraction",
39
+ "input_path": "data/test",
40
+ "feature_params": {"feature_set":["trace_length"]},
41
+ "output_path": "output/plots",
42
+ "real_eventlog_path": "data/BaselineED_feat.csv",
43
+ "plot_type": "boxplot"
44
+ },
45
+ {
46
+ "pipeline_step": "benchmark_test",
47
+ "benchmark_task": "discovery",
48
+ "input_path":"data/test",
49
+ "output_path":"output",
50
+ "miners" : ["inductive", "heu", "imf", "ilp"]
51
+ }
52
+ ]
utils/config_fabric.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from importlib import reload
3
+ from itertools import product
4
+ from pylab import *
5
+ import itertools
6
+ import json
7
+ import os
8
+ import pandas as pd
9
+ import pm4py
10
+ import random
11
+ import streamlit as st
12
+
13
+ st.set_page_config(layout='wide')
14
+ INPUT_XES="output/inputlog_temp.xes"
15
+
16
+ """
17
+ # Configuration File fabric for
18
+ ## GEDI: **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
19
+ """
20
+ def double_switch(label_left, label_right):
21
+ # Create two columns for the labels and toggle switch
22
+ col0, col1, col2, col3, col4 = st.columns([4,1, 1, 1,4])
23
+
24
+ # Add labels to the columns
25
+ with col1:
26
+ st.write(label_left)
27
+
28
+ with col2:
29
+ # Create the toggle switch
30
+ toggle_option = st.toggle(" ",value=False,
31
+ key="toggle_switch_"+label_left,
32
+ )
33
+
34
+ with col3:
35
+ st.write(label_right)
36
+ return toggle_option
37
+
38
+ def view(config_file):
39
+ st.write(config_file)
40
+
41
+ def get_ranges(stats, tuple_values):
42
+ col_for_row = ", ".join([f"x[\'{i}\'].astype(float)" for i in tuple_values])
43
+ stats['range'] = stats.apply(lambda x: tuple([eval(col_for_row)]), axis=1)
44
+ #tasks = eval(f"list(itertools.product({(parameters*n_para_obj)[:-2]}))")
45
+ result = [f"np.around({x}, 2)" for x in stats['range']]
46
+ result = ", ".join(result)
47
+ return result
48
+
49
+ def create_objectives_grid(df, objectives, n_para_obj=2):
50
+ parameters_o = "objectives, "
51
+ sel_features = df.index.to_list()
52
+ if n_para_obj==len(objectives):
53
+ parameters = get_ranges(df, sorted(objectives))
54
+ tasks = eval(f"list(itertools.product({parameters}))")[0]
55
+ cartesian_product = list(product(*tasks))
56
+ experiments = [{key: value[idx] for idx, key in enumerate(sel_features)} for value in cartesian_product]
57
+ return experiments
58
+ else:
59
+ if n_para_obj==1:
60
+ experiments = [[exp] for exp in objectives]
61
+ else:
62
+ experiments = eval(f"[exp for exp in list(itertools.product({(parameters_o*n_para_obj)[:-2]})) if exp[0]!=exp[1]]")
63
+ experiments = list(set([tuple(sorted(exp)) for exp in experiments]))
64
+ parameters = "np.around(np.arange(0.0, 1.5,0.5),2), "
65
+ tasks = eval(f"list(itertools.product({(parameters*n_para_obj)[:-2]}))")
66
+ print("TASKS", tasks, type(parameters), type(n_para_obj), parameters*n_para_obj)
67
+ #print(len(experiments), experiments)
68
+
69
+ print(len(tasks))
70
+
71
+ for exp in experiments:
72
+ df = pd.DataFrame(data=tasks, columns=["task", *exp])
73
+ #experiment_path = os.path.join('..','data', f'grid_{n_para_obj}obj')
74
+ #os.makedirs(experiment_path, exist_ok=True)
75
+ #experiment_path = os.path.join(experiment_path, f"grid_{len(df.columns)-1}objectives_{abbrev_obj_keys(exp)}.csv")
76
+ #df.to_csv(experiment_path, index=False)
77
+ #print(f"Saved experiment in {experiment_path}")
78
+ #write_generator_experiment(experiment_path, objectives=exp)
79
+
80
+ def set_up(generator_params):
81
+ create_button = False
82
+ experiments = []
83
+
84
+ col1, col2 = st.columns(2)
85
+ if True:
86
+ grid_option = double_switch("Point-", "Grid-based")
87
+ csv_option = double_switch("Manual", "From CSV")
88
+ if csv_option:
89
+ uploaded_file = st.file_uploader(f"Pick a csv-file containing feature values for features:", type="csv")
90
+ if uploaded_file is not None:
91
+ df = pd.read_csv(uploaded_file)
92
+ sel_features = st.multiselect("Selected features", list(df.columns))
93
+ df = df[sel_features]
94
+ if grid_option:
95
+ add_quantile = st.slider('Add %-quantile', min_value=0.0, max_value=100.0, value=50.0, step=5.0)
96
+ stats = df.describe().transpose()
97
+ stats[str(int(add_quantile))+"%"] = df.quantile(q=add_quantile/100)
98
+ view(stats)
99
+ tuple_values = st.multiselect("Tuples including", list(stats.columns)[3:], default=['min', 'max'])
100
+ experiments = create_objectives_grid(stats, tuple_values, n_para_obj=len(tuple_values))
101
+ else:
102
+ view(df)
103
+ experiments = df.to_dict(orient='records')
104
+ else:
105
+ sel_features = st.multiselect("Selected features", list(generator_params['experiment'].keys()))
106
+ for sel_feature in sel_features:
107
+ generator_params['experiment'][sel_feature] = float(st.text_input(sel_feature, generator_params['experiment'][sel_feature]))
108
+ generator_params['experiment'] = experiments
109
+ st.write(f"...result in {len(generator_params['experiment'])} experiments")
110
+
111
+ """
112
+ #### Configuration space
113
+ """
114
+ for key in generator_params['config_space'].keys():
115
+ generator_params['config_space'][key] = st.text_input(key, generator_params['config_space'][key])
116
+
117
+ #generator_params['config_space'] = st.text_input('config_space', generator_params['config_space'])
118
+ generator_params['n_trials'] = int(st.text_input('n_trials', generator_params['n_trials']))
119
+ return generator_params
120
+
121
+ if __name__ == '__main__':
122
+ config_layout = json.load(open("config_files/config_layout.json"))
123
+ type(config_layout)
124
+ step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
125
+ pipeline_steps = st.multiselect(
126
+ "Choose pipeline step",
127
+ step_candidates,
128
+ []
129
+ )
130
+ step_configs = []
131
+ set_col, view_col = st.columns([3, 2])
132
+ for pipeline_step in pipeline_steps:
133
+ step_config = [d for d in config_layout if d['pipeline_step'] == pipeline_step][0]
134
+ with set_col:
135
+ st.header(pipeline_step)
136
+ for step_key in step_config.keys():
137
+ if step_key == "generator_params":
138
+ st.subheader("Set-up experiments")
139
+ step_config[step_key] = set_up(step_config[step_key])
140
+ elif step_key != "pipeline_step":
141
+ step_config[step_key] = st.text_input(step_key, step_config[step_key])
142
+ with view_col:
143
+ view(step_config)
144
+ step_configs.append(step_config)
145
+ config_file = json.dumps(step_configs, indent=4)
146
+ output_path = st.text_input("Output file path", "config_files/experiment_config.json")
147
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
148
+ create_button = st.button("Save config file")
149
+ if create_button:
150
+ with open(output_path, "w") as f:
151
+ f.write(config_file)