Andrea MH committed
Commit 35765eb · unverified · parents: 6d970d7, 08a0359

Merge pull request #12 from andreamalhera/11-debug-run-generation-for-f-triangle

.conda.yml CHANGED
@@ -1,22 +1,156 @@
 name: gedi
 channels:
 - conda-forge
+- defaults
 dependencies:
-- python=3.9
-- numpy=1.23.1
-- scikit-learn=1.2.2
-- scipy
-- pandas
-- matplotlib
-- pip
+- asttokens=2.4.1=pyhd8ed1ab_0
+- backcall=0.2.0=pyh9f0ad1d_0
+- certifi=2024.2.2=pyhd8ed1ab_0
+- colorama=0.4.6=pyhd8ed1ab_0
+- comm=0.2.2=pyhd8ed1ab_0
+- cycler=0.12.1=pyhd8ed1ab_0
+- decorator=5.1.1=pyhd8ed1ab_0
+- executing=2.0.1=pyhd8ed1ab_0
+- importlib-metadata=7.1.0=pyha770c72_0
+- importlib-resources=6.4.0=pyhd8ed1ab_0
+- importlib_metadata=7.1.0=hd8ed1ab_0
+- importlib_resources=6.4.0=pyhd8ed1ab_0
+- jedi=0.19.1=pyhd8ed1ab_0
+- joblib=1.4.0=pyhd8ed1ab_0
+- jupyter_client=8.6.1=pyhd8ed1ab_0
+- matplotlib-inline=0.1.7=pyhd8ed1ab_0
+- munkres=1.1.4=pyh9f0ad1d_0
+- nest-asyncio=1.6.0=pyhd8ed1ab_0
+- packaging=24.0=pyhd8ed1ab_0
+- parso=0.8.4=pyhd8ed1ab_0
+- pickleshare=0.7.5=py_1003
+- pip=24.0=pyhd8ed1ab_0
+- platformdirs=4.2.0=pyhd8ed1ab_0
+- ply=3.11=pyhd8ed1ab_2
+- prompt-toolkit=3.0.42=pyha770c72_0
+- prompt_toolkit=3.0.42=hd8ed1ab_0
+- pure_eval=0.2.2=pyhd8ed1ab_0
+- pygments=2.17.2=pyhd8ed1ab_0
+- pyparsing=3.1.2=pyhd8ed1ab_0
+- python-dateutil=2.9.0=pyhd8ed1ab_0
+- python-tzdata=2024.1=pyhd8ed1ab_0
+- python_abi=3.9=4_cp39
+- pytz=2024.1=pyhd8ed1ab_0
+- setuptools=69.5.1=pyhd8ed1ab_0
+- six=1.16.0=pyh6c4a22f_0
+- stack_data=0.6.2=pyhd8ed1ab_0
+- threadpoolctl=3.4.0=pyhc1e730c_0
+- toml=0.10.2=pyhd8ed1ab_0
+- tomli=2.0.1=pyhd8ed1ab_0
+- traitlets=5.14.3=pyhd8ed1ab_0
+- typing_extensions=4.11.0=pyha770c72_0
+- tzdata=2024a=h0c530f3_0
+- wcwidth=0.2.13=pyhd8ed1ab_0
+- wheel=0.43.0=pyhd8ed1ab_1
+- zipp=3.17.0=pyhd8ed1ab_0
 - pip:
-  - pm4py==2.7.2
-  - imblearn
-  - seaborn
-  - feeed
-  - smac
-  - ConfigSpace==0.7.1
-  - tqdm
-  - Levenshtein
-  - streamlit
-  - streamlit-toggle-switch
+  - altair==5.3.0
+  - asttokens==2.4.1
+  - attrs==23.2.0
+  - backcall==0.2.0
+  - blinker==1.8.2
+  - brotli==1.1.0
+  - cachetools==5.4.0
+  - certifi==2024.2.2
+  - charset-normalizer==3.3.2
+  - click==8.1.7
+  - cloudpickle==3.0.0
+  - comm==0.2.2
+  - configspace==0.7.1
+  - contourpy==1.2.1
+  - cycler==0.12.1
+  - cvxopt==1.3.2
+  - dask==2024.4.1
+  - dask-jobqueue==0.8.5
+  - debugpy==1.8.1
+  - decorator==5.1.1
+  - deprecation==2.1.0
+  - distributed==2024.4.1
+  - emcee==3.1.4
+  - executing==2.0.1
+  - feeed==1.2.0
+  - fsspec==2024.3.1
+  - fonttools==4.51.0
+  - gitdb==4.0.11
+  - gitpython==3.1.43
+  - graphviz==0.20.3
+  - idna==3.7
+  - importlib-metadata==7.1.0
+  - importlib-resources==6.4.0
+  - imbalanced-learn==0.12.2
+  - imblearn==0.0
+  - intervaltree==3.1.0
+  - ipykernel==6.29.3
+  - ipython==8.12.0
+  - jedi==0.19.1
+  - jinja2==3.1.3
+  - joblib==1.4.0
+  - jsonschema==4.23.0
+  - jsonschema-specifications==2023.12.1
+  - jupyter_client==8.6.1
+  - jupyter_core==5.7.2
+  - kiwisolver==1.4.5
+  - levenshtein==0.23.0
+  - llvmlite==0.42.0
+  - locket==1.0.0
+  - lxml==5.2.1
+  - markdown-it-py==3.0.0
+  - markupsafe==2.1.5
+  - matplotlib==3.8.4
+  - matplotlib-inline==0.1.7
+  - mdurl==0.1.2
+  - more-itertools==10.2.0
+  - msgpack==1.0.8
+  - munkres==1.1.4
+  - networkx==3.2.1
+  - numba==0.59.1
+  - numpy==1.26.4
+  - opyenxes==0.3.0
+  - partd==1.4.1
+  - pandas==2.2.2
+  - pm4py==2.7.2
+  - protobuf==5.27.2
+  - pyarrow==17.0.0
+  - pydeck==0.9.1
+  - pydotplus==2.0.2
+  - pynisher==1.0.10
+  - pyrfr==0.9.0
+  - python-dateutil==2.9.0
+  - pyyaml==6.0.1
+  - rapidfuzz==3.8.1
+  - referencing==0.35.1
+  - regex==2023.12.25
+  - requests==2.32.3
+  - rich==13.7.1
+  - rpds-py==0.19.0
+  - seaborn==0.13.2
+  - scikit-learn==1.2.2
+  - scipy==1.13.0
+  - slicer==0.0.8
+  - smac==2.0.2
+  - smmap==5.0.1
+  - sortedcontainers==2.4.0
+  - stack_data==0.6.2
+  - streamlit==1.36.0
+  - stringdist==1.0.9
+  - tabulate==0.9.0
+  - tblib==3.0.0
+  - tenacity==8.5.0
+  - threadpoolctl==3.4.0
+  - toml==0.10.2
+  - tomli==2.0.1
+  - tornado==6.4
+  - tqdm==4.65.0
+  - toolz==0.12.1
+  - tzdata==2024.1
+  - urllib3==2.2.1
+  - watchdog==4.0.1
+  - xgboost==2.1.0
+  - zict==3.0.0
+  - zipp==3.17.0
+  - zstd==1.5.5.1
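Note: the environment file is now fully pinned, down to conda build strings and exact pip versions. A quick sanity check after `conda env create -f .conda.yml` (a sketch, not part of this commit; package names and versions taken from the pip section above):

    # Spot-check a few of the pip pins resolved in the active environment.
    from importlib.metadata import version

    for pkg, pinned in {"pm4py": "2.7.2", "smac": "2.0.2", "streamlit": "1.36.0"}.items():
        assert version(pkg) == pinned, f"{pkg}: expected {pinned}, got {version(pkg)}"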
.github/workflows/test_gedi.yml CHANGED
@@ -69,6 +69,14 @@ jobs:
         run:
           diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json
 
+      - name: Compare output 3
+        run:
+          diff data/validation/genELexperiment3_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json
+
+      - name: Compare output 4
+        run:
+          diff data/validation/genELexperiment4_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json
+
   test_benchmark:
     runs-on: ubuntu-latest
 
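Note: the two new CI steps `diff` the feature JSONs generated for the NaN rows against the checked-in validation files. A minimal local stand-in (paths copied from the workflow; comparing parsed JSON is semantically looser than a byte-level `diff`):

    import json

    def assert_same(expected, actual):
        # Compare parsed JSON rather than raw bytes, so key order does not matter.
        with open(expected) as e, open(actual) as a:
            assert json.load(e) == json.load(a), f"{actual} differs from {expected}"

    assert_same("data/validation/genELexperiment3_04.json",
                "output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json")
    assert_same("data/validation/genELexperiment4_02.json",
                "output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json")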
data/test/grid_feat.csv CHANGED
@@ -1,3 +1,5 @@
 log,ratio_top_20_variants,epa_normalized_sequence_entropy_linear_forgetting
 experiment1,0.2,0.4
 experiment2,0.4,0.7
+experiment3,NaN,0.4
+experiment4,0.2,NaN
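Note: each new row leaves exactly one objective empty; pandas parses the NaN cells as float('nan'), which feeds the dropna handling added in gedi/generator.py below. A sketch of how one row becomes a partial objective dict:

    import pandas as pd

    tasks = pd.read_csv("data/test/grid_feat.csv")
    row = tasks.iloc[2].drop("log")   # experiment3: ratio_top_20_variants is NaN
    print(row.dropna().to_dict())
    # {'epa_normalized_sequence_entropy_linear_forgetting': 0.4}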
data/validation/genELexperiment3_04.json ADDED
@@ -0,0 +1 @@
+{"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "experiment3"}
data/validation/genELexperiment4_02.json ADDED
@@ -0,0 +1 @@
+{"ratio_top_20_variants": 0.2, "log": "experiment4"}
gedi/benchmark.py CHANGED
@@ -109,7 +109,7 @@ class BenchmarkTest:
         results['log'] = log_name
 
         print(f" SUCCESS: {miner} miner for {results} took {dt.now()-start_miner} sec.")
-        dump_features_json(results, dump_path, log_name, content_type="benchmark")
+        dump_features_json(results, os.path.join(dump_path, log_name), content_type="benchmark")
         return
 
     def split_miner_wrapper(self, log_path="data/real_event_logs/BPI_Challenges/BPI_Challenge_2012.xes"):
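Note: dump_features_json no longer takes a separate identifier argument (see gedi/utils/io_helpers.py below); the log name is folded into the path up front. A sketch with assumed values for dump_path and log_name:

    import os

    dump_path, log_name = "output/grid_feat", "BPI_Challenge_2012"
    output_path = os.path.join(dump_path, log_name)
    # dump_features_json(results, output_path, content_type="benchmark")
    # would now write output/benchmark/grid_feat/BPI_Challenge_2012.json

The same call-site change lands in gedi/features.py below.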
gedi/features.py CHANGED
@@ -159,6 +159,6 @@ class EventLogFeatures(EventLogFile):
 
                 identifier = file.rsplit(".", 1)[0]
                 print(f" DONE: {file_path}. FEEED computed {feature_set}")
-                dump_features_json(features, self.root_path, identifier)
+                dump_features_json(features, os.path.join(self.root_path,identifier))
                 return features
 
gedi/generator.py CHANGED
@@ -19,7 +19,8 @@ from pm4py.sim import play_out
 from smac import HyperparameterOptimizationFacade, Scenario
 from utils.param_keys import OUTPUT_PATH, INPUT_PATH
 from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
-from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, read_csvs
+from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json
+from gedi.utils.io_helpers import read_csvs
 import xml.etree.ElementTree as ET
 import re
 from xml.dom import minidom
@@ -80,7 +81,7 @@ def removeextralines(elem):
         element.tail=""
         if not re.search(hasWords,str(element.text)):
             element.text = ""
-
+
 def add_extension_before_traces(xes_file):
     # Register the namespace
     ET.register_namespace('', "http://www.xes-standard.org/")
@@ -158,12 +159,13 @@ class GenerateEventLogs():
         tasks=tasks.rename(columns={"ratio_variants_per_number_of_traces": "ratio_unique_traces_per_trace"})
 
         if tasks is not None:
+            self.feature_keys = sorted([feature for feature in tasks.columns.tolist() if feature != "log"])
             num_cores = multiprocessing.cpu_count() if len(tasks) >= multiprocessing.cpu_count() else len(tasks)
             #self.generator_wrapper([*tasks.iterrows()][0])# For testing
             with multiprocessing.Pool(num_cores) as p:
                 print(f"INFO: Generator starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(tasks)} tasks...")
                 random.seed(RANDOM_SEED)
-                log_config = p.map(self.generator_wrapper, tasks.iterrows())
+                log_config = p.map(self.generator_wrapper, [(index, row) for index, row in tasks.iterrows()])
             self.log_config = log_config
 
         else:
@@ -192,7 +194,7 @@ class GenerateEventLogs():
         except IndexError:
             identifier = task[0]+1
         task = task[1].loc[lambda x, identifier=identifier: x!=identifier]
-        self.objectives = task.to_dict()
+        self.objectives = task.dropna().to_dict()
         random.seed(RANDOM_SEED)
         self.configs = self.optimize()
 
@@ -207,8 +209,8 @@ class GenerateEventLogs():
         if self.objectives.get('ratio_unique_traces_per_trace'):#HOTFIX
             self.objectives['ratio_variants_per_number_of_traces']=self.objectives.pop('ratio_unique_traces_per_trace')
 
-        save_path = get_output_key_value_location(self.objectives,
-                                                  self.output_path, identifier)+".xes"
+        save_path = get_output_key_value_location(task.to_dict(),
+                                                  self.output_path, identifier, self.feature_keys)+".xes"
 
         write_xes(log_config['log'], save_path)
         add_extension_before_traces(save_path)
@@ -219,7 +221,7 @@ class GenerateEventLogs():
         if features_to_dump.get('ratio_unique_traces_per_trace'):#HOTFIX
             features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
         features_to_dump['log'] = identifier.replace('genEL', '')
-        dump_features_json(features_to_dump, self.output_path, identifier, objectives=self.objectives)
+        dump_features_json(features_to_dump, save_path)
         return log_config
 
     def generate_optimized_log(self, config):
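Note: the NaN handling works in two parts here: self.objectives keeps only the non-NaN targets for the optimizer, while save_path is built from the full task dict plus self.feature_keys, so every generated file name still carries a slot for each feature and a missing value surfaces as "nan" (matching the genEL*_nan* names in the workflow above). A sketch, assuming one row from data/test/grid_feat.csv:

    import math

    task = {"ratio_top_20_variants": float("nan"),
            "epa_normalized_sequence_entropy_linear_forgetting": 0.4}
    # Equivalent of task.dropna().to_dict() on a pandas Series:
    objectives = {k: v for k, v in task.items() if not math.isnan(v)}
    print(objectives)  # {'epa_normalized_sequence_entropy_linear_forgetting': 0.4}
    # get_output_key_value_location(task, output_path, identifier, self.feature_keys)
    # then still encodes both features, one of them as "nan", in the .xes file name.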
gedi/utils/io_helpers.py CHANGED
@@ -52,9 +52,10 @@ def get_keys_abbreviation(obj_keys):
         abbreviated_keys.append(abbreviated_key)
     return '_'.join(abbreviated_keys)
 
-def get_output_key_value_location(obj, output_path, identifier):
+def get_output_key_value_location(obj, output_path, identifier, obj_keys=None):
     obj_sorted = dict(sorted(obj.items()))
-    obj_keys = [*obj_sorted.keys()]
+    if obj_keys is None:
+        obj_keys = [*obj_sorted.keys()]
 
     obj_values = [round(x, 4) for x in [*obj_sorted.values()]]
 
@@ -71,15 +72,11 @@ def get_output_key_value_location(obj, output_path, identifier):
     save_path = os.path.join(folder_path, generated_file_name)
     return save_path
 
-def dump_features_json(features: dict, output_path, identifier, objectives=None, content_type="features"):
-    output_parts = PurePath(output_path).parts
-    feature_dir = os.path.join(output_parts[0], content_type,
+def dump_features_json(features: dict, output_path, content_type="features"):
+    output_parts = PurePath(output_path.split(".xes")[0]).parts
+    features_path = os.path.join(output_parts[0], content_type,
                                *output_parts[1:])
-    if objectives is not None:
-        json_path = get_output_key_value_location(objectives,
-                                                  feature_dir, identifier)+".json"
-    else:
-        json_path = os.path.join(feature_dir, identifier)+".json"
+    json_path = features_path+'.json'
 
     os.makedirs(os.path.split(json_path)[0], exist_ok=True)
     with open(json_path, 'w') as fp:
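Note: dump_features_json now derives everything from a single output_path: it strips a trailing .xes if present, splices content_type in after the first path component, and appends .json. A sketch of the path arithmetic with an assumed input:

    import os
    from pathlib import PurePath

    output_path = "output/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.xes"
    parts = PurePath(output_path.split(".xes")[0]).parts
    json_path = os.path.join(parts[0], "features", *parts[1:]) + ".json"
    print(json_path)
    # output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json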
utils/merge_jsons.py CHANGED
@@ -4,12 +4,12 @@ import csv
 import os
 
 """
-   Run using:
+    Run using:
     python merge_jsons.py path_to_your_json_directory output.csv
 
 """
 def json_to_csv(json_dir, output_csv):
-
+
     json_files = [os.path.join(json_dir, file) for file in os.listdir(json_dir) if file.endswith('.json')]
 
     # Collect data from all JSON files
@@ -18,13 +18,13 @@ def json_to_csv(json_dir, output_csv):
         with open(json_file, 'r') as f:
             data = json.load(f)
             all_data.append(data)
-
+
     # Extract the headers from the first JSON object
     if all_data:
-        headers = all_data[0].keys()
+        headers = {elem for s in [set(i) for i in [d.keys() for d in all_data]] for elem in s}
    else:
         raise ValueError("No data found in JSON files")
-
+
     # Write data to CSV
     with open(output_csv, 'w', newline='') as f:
         writer = csv.DictWriter(f, fieldnames=headers)
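Note: taking the union of keys across all JSON files keeps csv.DictWriter from raising ValueError when later files carry keys the first one lacks; missing cells default to the writer's restval (an empty string). The in-code comment still says "first JSON object", which now undersells the change. A flatter, equivalent form of the new expression (sample data assumed):

    import csv, io

    all_data = [{"log": "experiment3", "epa_normalized_sequence_entropy_linear_forgetting": 0.05},
                {"log": "experiment4", "ratio_top_20_variants": 0.2}]
    headers = {key for d in all_data for key in d}  # same result as the nested set build above
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=sorted(headers))
    writer.writeheader()
    writer.writerows(all_data)  # absent keys fall back to restval=''
    print(buf.getvalue())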