Spaces:

andreamalhera
/

igedi

Sleeping

App Files Community

Andrea MH committed on Aug 22, 2024

Commit

35cb48f

unverified ·

2 Parent(s): 6b0f858 377d68e

Merge pull request #21 from andreamalhera/20-dashboard-run-gedi-from-dashboard

Browse files

Files changed (15) hide show

.github/workflows/test_gedi.yml +2 -2
config.py +0 -2
config_files/config_layout.json +1 -1
data/validation/genELexperiment1_04_02.json +1 -1
data/validation/genELexperiment2_07_04.json +1 -1
data/validation/genELexperiment3_04_nan.json +1 -1
gedi/augmentation.py +1 -2
gedi/benchmark.py +4 -7
gedi/features.py +1 -3
gedi/plotter.py +1 -2
gedi/utils/io_helpers.py +8 -4
gedi/utils/logo.png +0 -0
setup.py +1 -1
utils/config_fabric.py +103 -26
utils/merge_csvs.py +0 -2

.github/workflows/test_gedi.yml CHANGED Viewed

@@ -63,11 +63,11 @@ jobs:
     - name: Compare output 1
       run:
-        diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
     - name: Compare output 2
       run:
-        diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
     - name: Compare output 3
       run:

     - name: Compare output 1
       run:
+        diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
     - name: Compare output 2
       run:
+        diff data/validation/genELexperiment2_07_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment2_07_04.json
     - name: Compare output 3
       run:

config.py CHANGED Viewed

@@ -1,10 +1,8 @@
 import json
-import os
 import warnings
 from utils.param_keys import PIPELINE_STEP, INPUT_PATH, OUTPUT_PATH
 from utils.param_keys.features import FEATURE_SET, FEATURE_PARAMS
-from tqdm import tqdm
 def get_model_params_list(alg_json_file: str) :#-> list[dict]:
     """

 import json
 import warnings
 from utils.param_keys import PIPELINE_STEP, INPUT_PATH, OUTPUT_PATH
 from utils.param_keys.features import FEATURE_SET, FEATURE_PARAMS
 def get_model_params_list(alg_json_file: str) :#-> list[dict]:
     """

config_files/config_layout.json CHANGED Viewed

@@ -9,7 +9,7 @@
   {
     "pipeline_step": "event_logs_generation",
     "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
-    "output_path": "data/test",
     "generator_params": {
       "experiment": "data/grid_objectives.csv",
       "experiment": {"input_path": "data/2_bpic_features.csv",

   {
     "pipeline_step": "event_logs_generation",
     "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
+    "output_path": "data/frontend/test",
     "generator_params": {
       "experiment": "data/grid_objectives.csv",
       "experiment": {"input_path": "data/2_bpic_features.csv",

data/validation/genELexperiment1_04_02.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.6520971605578558}


1	+ {"ratio_top_20_variants": 0.20017714791851196, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment1_04_02", "target_similarity": 0.7418932364693804}

data/validation/genELexperiment2_07_04.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.35199750692556764}


1	+ {"ratio_top_20_variants": 0.38863337713534823, "epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment2_07_04", "target_similarity": 0.6067951985524301}

data/validation/genELexperiment3_04_nan.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.6520972056586477}


1	+ {"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "genELexperiment3_04_nan", "target_similarity": 0.7418932612931086}

gedi/augmentation.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import pandas as pd
 from collections import Counter
 from datetime import datetime as dt
-from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE
-from sklearn.preprocessing import Normalizer
 from gedi.utils.matrix_tools import insert_missing_data
 from utils.param_keys import INPUT_PATH, OUTPUT_PATH
 from utils.param_keys.augmentation import AUGMENTATION_PARAMS, NO_SAMPLES, FEATURE_SELECTION, METHOD

 import pandas as pd
 from collections import Counter
 from datetime import datetime as dt
+from imblearn.over_sampling import SMOTE
 from gedi.utils.matrix_tools import insert_missing_data
 from utils.param_keys import INPUT_PATH, OUTPUT_PATH
 from utils.param_keys.augmentation import AUGMENTATION_PARAMS, NO_SAMPLES, FEATURE_SELECTION, METHOD

gedi/benchmark.py CHANGED Viewed

@@ -5,15 +5,12 @@ import pandas as pd
 import subprocess
 from datetime import datetime as dt
-from functools import partial, partialmethod
 from itertools import repeat
-from pathlib import Path
-from pm4py import read_xes, convert_to_bpmn, read_bpmn, convert_to_petri_net, check_soundness
 from pm4py import discover_petri_net_inductive, discover_petri_net_ilp, discover_petri_net_heuristics
-from pm4py import fitness_alignments, fitness_token_based_replay
-from pm4py import precision_alignments, precision_token_based_replay
-from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
-from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
 from pm4py.objects.bpmn.obj import BPMN
 from pm4py.objects.log.importer.xes import importer as xes_importer
 from gedi.utils.io_helpers import dump_features_json

 import subprocess
 from datetime import datetime as dt
+from functools import partialmethod
 from itertools import repeat
+from pm4py import convert_to_bpmn, read_bpmn, convert_to_petri_net, check_soundness
 from pm4py import discover_petri_net_inductive, discover_petri_net_ilp, discover_petri_net_heuristics
+from pm4py import fitness_alignments
+from pm4py import precision_alignments
 from pm4py.objects.bpmn.obj import BPMN
 from pm4py.objects.log.importer.xes import importer as xes_importer
 from gedi.utils.io_helpers import dump_features_json

gedi/features.py CHANGED Viewed

@@ -1,14 +1,12 @@
 import json
 import multiprocessing
-import numpy as np
 import pandas as pd
 import os
 from datetime import datetime as dt
 from functools import partial
 from feeed.feature_extractor import extract_features
-from pathlib import Path, PurePath
-from sklearn.impute import SimpleImputer
 from utils.param_keys import INPUT_PATH
 from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
 from gedi.utils.io_helpers import dump_features_json

 import json
 import multiprocessing
 import pandas as pd
 import os
 from datetime import datetime as dt
 from functools import partial
 from feeed.feature_extractor import extract_features
+from pathlib import Path
 from utils.param_keys import INPUT_PATH
 from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
 from gedi.utils.io_helpers import dump_features_json

gedi/plotter.py CHANGED Viewed

@@ -12,14 +12,13 @@ from matplotlib.axes import Axes
 from matplotlib.figure import Figure
 from matplotlib.lines import Line2D
 from utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
-from utils.param_keys import INPUT_PATH, OUTPUT_PATH, PIPELINE_STEP
 from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
 from utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
 from collections import defaultdict
 from sklearn.preprocessing import Normalizer, StandardScaler
 from sklearn.decomposition import PCA
-from sklearn.metrics.pairwise import euclidean_distances
 from gedi.generator import get_tasks
 from gedi.utils.io_helpers import get_keys_abbreviation
 from gedi.utils.io_helpers import read_csvs, select_instance

 from matplotlib.figure import Figure
 from matplotlib.lines import Line2D
 from utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
+from utils.param_keys import OUTPUT_PATH, PIPELINE_STEP
 from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
 from utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
 from collections import defaultdict
 from sklearn.preprocessing import Normalizer, StandardScaler
 from sklearn.decomposition import PCA
 from gedi.generator import get_tasks
 from gedi.utils.io_helpers import get_keys_abbreviation
 from gedi.utils.io_helpers import read_csvs, select_instance

gedi/utils/io_helpers.py CHANGED Viewed

@@ -6,7 +6,7 @@ import re
 import shutil
 import numpy as np
 from collections import defaultdict
-from pathlib import Path, PurePath
 from scipy.spatial.distance import euclidean
 def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
@@ -86,7 +86,11 @@ def dump_features_json(features: dict, output_path, content_type="features"):
 def compute_similarity(v1, v2):
-    # HOTFIX: Rename 'ratio_unique_traces_per_trace
     if 'ratio_unique_traces_per_trace' in v1:
         v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
@@ -104,7 +108,7 @@ def compute_similarity(v1, v2):
     else:
         # Calculate Euclidean Similarity
-        target_similarity = 1-euclidean(vec1, vec2)
-        #print("VECTORS: ", vec1, vec2, target_similarity)
         return target_similarity

 import shutil
 import numpy as np
 from collections import defaultdict
+from pathlib import PurePath
 from scipy.spatial.distance import euclidean
 def select_instance(source_dir, log_path, destination=os.path.join("output","generated","instance_selection")):
 def compute_similarity(v1, v2):
+    # Convert all values to float, except the value stored under the key "log"
+    v1 = {k: (float(v) if k != "log" else v) for k, v in v1.items()}
+    v2 = {k: (float(v) if k != "log" else v) for k, v in v2.items()}
+    # HOTFIX: Rename 'ratio_unique_traces_per_trace'
     if 'ratio_unique_traces_per_trace' in v1:
         v1['ratio_variants_per_number_of_traces'] = v1.pop('ratio_unique_traces_per_trace')
     else:
         # Calculate Euclidean Similarity
+        target_similarity = 1 / (1 + euclidean(vec1, vec2))
+        # print("VECTORS: ", vec1, vec2, target_similarity)
         return target_similarity

gedi/utils/logo.png ADDED Viewed

setup.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from setuptools import setup, find_packages
 import os
 with open("README.md", "r") as fh:

+from setuptools import setup
 import os
 with open("README.md", "r") as fh:

utils/config_fabric.py CHANGED Viewed

@@ -1,24 +1,59 @@
-from copy import deepcopy
-from importlib import reload
 from itertools import product as cproduct
 from itertools import combinations
 from pylab import *
-import itertools
 import json
 import math
 import os
 import pandas as pd
-import random
 import streamlit as st
 import subprocess
 st.set_page_config(layout='wide')
 INPUT_XES="output/inputlog_temp.xes"
-"""
-# Configuration File fabric for
-## GEDI: **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
-"""
 def double_switch(label_left, label_right, third_label=None, fourth_label=None):
     if third_label==None and fourth_label==None:
         # Create two columns for the labels and toggle switch
@@ -185,10 +220,11 @@ def set_generator_experiments(generator_params):
                 with col2:
                     sel_features = feature_select()
                 values_indexes = ["value "+str(i+1) for i in range(num_values)]
                 values_defaults = ['*(1+2*0.'+str(i)+')' for i in range(num_values)]
                 cross_labels =  [feature[0]+': '+feature[1] for feature in list(cproduct(sel_features,values_indexes))]
-                cross_values = [round(eval(str(combination[0])+combination[1]), 2) for combination in list(cproduct(list(generator_params['experiment'].values()), values_defaults))]
                 parameters = split_list(list(input_multicolumn(cross_labels, cross_values, n_cols=num_values)), len(sel_features))
                 tasks = f"list({parameters})"
@@ -234,6 +270,7 @@ def set_generator_experiments(generator_params):
     return generator_params
 if __name__ == '__main__':
     config_layout = json.load(open("config_files/config_layout.json"))
     type(config_layout)
     step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
@@ -266,25 +303,65 @@ if __name__ == '__main__':
     output_path = st.text_input("Output file path", "config_files/experiment_config.json")
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     save_labels = ["Save config file", "Save and run config_file"]
-    save_labels = ["Save configuration file"]
-    #create_button, create_run_button = multi_button(save_labels)
-    create_button = multi_button(save_labels)
-    # FIXME: Bug: automatically updates the experiment_config.json file even without pressing the save button
-    if create_button: # or create_run_button:
         with open(output_path, "w") as f:
             f.write(config_file)
         st.write("Saved configuration in ", output_path, ". Run command:")
-        #if create_run_button:
-        if True:
-            var = f"python -W ignore main.py -a {output_path}"
-            st.code(var, language='bash')
-        if False: #FIXME: Command fails when using multiprocessing
-            command = var.split()
-            # Run the command
             result = subprocess.run(command, capture_output=True, text=True)
-            if len(result.stderr)==0:
-                st.write(result.stdout)
-            else:
-                st.write("ERROR: ", result.stderr)

 from itertools import product as cproduct
 from itertools import combinations
+from pathlib import Path
 from pylab import *
+import base64
 import json
 import math
 import os
 import pandas as pd
 import streamlit as st
 import subprocess
+import time
+import shutil
 st.set_page_config(layout='wide')
 INPUT_XES="output/inputlog_temp.xes"
+LOGO_PATH="gedi/utils/logo.png"
+def get_base64_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode()
+def play_header():
+    # Convert local image to base64
+    logo_base64 = get_base64_image(LOGO_PATH)
+    # HTML and CSS for placing the logo at the top left corner
+    head1, head2 = st.columns([1,8])
+    head1.markdown(
+        f"""
+        <style>
+        .header-logo {{
+            display: flex;
+            align-items: center;
+            justify-content: flex-start;
+        }}
+        .header-logo img {{
+            max-width: 120px; /* Adjust the size as needed */
+            height: auto;
+        }}
+        </style>
+        <div class="header-logo">
+            <img src="data:image/png;base64,{logo_base64}" alt="Logo">
+        </div>
+        """,
+        unsafe_allow_html=True
+    )
+    with head2:
+        """
+        # interactive GEDI
+        """
+    """
+    ## **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
+    """
+    return
 def double_switch(label_left, label_right, third_label=None, fourth_label=None):
     if third_label==None and fourth_label==None:
         # Create two columns for the labels and toggle switch
                 with col2:
                     sel_features = feature_select()
+                filtered_dict = {key: generator_params['experiment'][key] for key in sel_features if key in generator_params['experiment']}
                 values_indexes = ["value "+str(i+1) for i in range(num_values)]
                 values_defaults = ['*(1+2*0.'+str(i)+')' for i in range(num_values)]
                 cross_labels =  [feature[0]+': '+feature[1] for feature in list(cproduct(sel_features,values_indexes))]
+                cross_values = [round(eval(str(combination[0])+combination[1]), 2) for combination in list(cproduct(list(filtered_dict.values()), values_defaults))]
                 parameters = split_list(list(input_multicolumn(cross_labels, cross_values, n_cols=num_values)), len(sel_features))
                 tasks = f"list({parameters})"
     return generator_params
 if __name__ == '__main__':
+    play_header()
     config_layout = json.load(open("config_files/config_layout.json"))
     type(config_layout)
     step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
     output_path = st.text_input("Output file path", "config_files/experiment_config.json")
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     save_labels = ["Save config file", "Save and run config_file"]
+    #save_labels = ["Save configuration file"]
+    create_button, create_run_button = multi_button(save_labels)
+    #create_button = multi_button(save_labels)
+    if create_button or create_run_button:
         with open(output_path, "w") as f:
             f.write(config_file)
         st.write("Saved configuration in ", output_path, ". Run command:")
+        create_button = False
+        var = f"python -W ignore main.py -a {output_path}"
+        st.code(var, language='bash')
+        if create_run_button:
+            command = var.split()
+            progress_bar = st.progress(0)  # Initialize the progress bar
+            directory = Path(step_config['output_path']).parts
+            path = os.path.join(directory[0], 'features', *directory[1:])
+            if os.path.exists(path): shutil.rmtree(path)
+            # Simulate running the command with a loop and updating the progress bar
+            for i in range(95):
+                time.sleep(0.2)  # Simulate the time taken for each step
+                progress_bar.progress(i + 1)
+            # Run the actual command
             result = subprocess.run(command, capture_output=True, text=True)
+            st.write("## Results")
+            # st.write(*step_config['generator_params']['experiment'][0].keys(), "log name", "target similarity")
+            directory = Path(step_config['output_path']).parts
+            path = os.path.join(directory[0], 'features', *directory[1:])
+            dataframes = []
+            # Walk through all directories and files
+            for root, dirs, files in os.walk(path):
+                feature_files = [os.path.join(root, file) for file in files]
+                for feature_file in feature_files:
+                    df_temp = pd.read_json(feature_file,lines=True)
+                    dataframes.append(df_temp)
+                    # Print the contents of the JSON file
+                    # st.write(*config_targets.values(), data['log'], data['target_similarity'])
+            dataframes = pd.concat(dataframes, ignore_index=True)
+            # dataframes = dataframes.sort_values(by=['log'])
+            dataframes = dataframes.set_index('log')
+            col1, col2 = st.columns([2, 3])  # Adjust the ratio as needed
+            with col1:
+                st.dataframe(dataframes)
+            with col2:
+                plt.figure(figsize=(4, 2))
+                plt.plot(dataframes.index, dataframes['target_similarity'], 'o-')
+                plt.xlabel('log', fontsize=5)
+                plt.ylabel('target_similarity', fontsize=5)
+                plt.xticks(rotation=45, ha='right', fontsize=5)
+                plt.tight_layout()
+                st.pyplot(plt)
+            # Optional: Updating the progress bar to indicate completion
+            progress_bar.progress(100)

utils/merge_csvs.py CHANGED Viewed

@@ -1,9 +1,7 @@
 import os
 import pandas as pd
 import sys
-import tqdm
-from gedi.utils.io_helpers import sort_files
 FILE_START = sys.argv[1]
 ROOT_PATH, FILE_START = os.path.split(FILE_START)

 import os
 import pandas as pd
 import sys
 FILE_START = sys.argv[1]
 ROOT_PATH, FILE_START = os.path.split(FILE_START)