Andrea Maldonado committed on
Commit
e614e81
·
1 Parent(s): 4927cc1

Updates feature computation using compute_features_from_event_data

Browse files
Files changed (2) hide show
  1. gedi/features.py +59 -20
  2. gedi/run.py +3 -3
gedi/features.py CHANGED
@@ -2,21 +2,57 @@ import json
2
  import multiprocessing
3
  import pandas as pd
4
  import os
 
5
 
6
  from datetime import datetime as dt
7
  from functools import partial
 
 
 
 
8
  from feeed.feature_extractor import extract_features
9
- from pathlib import Path
 
 
 
 
10
  from gedi.utils.column_mappings import column_mappings
11
  from gedi.utils.io_helpers import dump_features_json
12
  from gedi.utils.param_keys import INPUT_PATH
13
  from gedi.utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
14
-
15
- #TODO: replace with other feature file
16
- def get_sortby_parameter(elem):
17
- number = int(elem.rsplit(".")[0].rsplit("_", 1)[1])
18
- return number
19
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  class EventLogFile:
22
  def __init__(self, filename, folder_path):
@@ -27,7 +63,7 @@ class EventLogFile:
27
  def filepath(self) -> str:
28
  return str(os.path.join(self.root_path, self.filename))
29
 
30
- class EventLogFeatures(EventLogFile):
31
  def __init__(self, filename=None, folder_path='data/event_log', params=None, logs=None, ft_params=None):
32
  super().__init__(filename, folder_path)
33
  if ft_params == None:
@@ -54,17 +90,18 @@ class EventLogFeatures(EventLogFile):
54
 
55
  try:
56
  start = dt.now()
57
- print("=========================== EventLogFeatures Computation===========================")
58
 
59
  print(f"INFO: Running with {ft_params}")
60
 
61
- if str(self.filename).endswith('csv'): # Returns dataframe from loaded metafeatures file
62
  self.feat = pd.read_csv(self.filepath)
63
  columns_to_rename = {col: column_mappings()[col] for col in self.feat.columns if col in column_mappings()}
64
  self.feat.rename(columns=columns_to_rename, inplace=True)
65
- print(f"SUCCESS: EventLogFeatures loaded features from {self.filepath}")
66
- elif isinstance(self.filename, list): # Computes metafeatures for list of .xes files
67
  combined_features=pd.DataFrame()
 
68
  if self.filename[0].endswith(".json"):
69
  self.filename = [ filename for filename in self.filename if filename.endswith(".json")]
70
  dfs = []
@@ -80,8 +117,8 @@ class EventLogFeatures(EventLogFile):
80
  self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
81
  self.root_path=Path(os.path.split(self.root_path)[0])
82
  combined_features.to_csv(self.filepath, index=False)
83
- print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)} features for {len(self.feat)} in {self.filepath}")
84
- print("=========================== ~ EventLogFeatures Computation=========================")
85
  return
86
  else:
87
  self.filename = [ filename for filename in self.filename if filename.endswith(".xes")]
@@ -94,7 +131,7 @@ class EventLogFeatures(EventLogFile):
94
  with multiprocessing.Pool(num_cores) as p:
95
  try:
96
  print(
97
- f"INFO: EventLogFeatures starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(self.filename)} files, namely {self.filename}...")
98
  result = p.map(partial(self.extract_features_wrapper, feature_set = self.params[FEATURE_SET])
99
  , self.filename)
100
  result = [i for i in result if i is not None]
@@ -114,7 +151,7 @@ class EventLogFeatures(EventLogFile):
114
 
115
  except KeyError as error:
116
  print("Ignoring KeyError", error)
117
- # Aggregates metafeatures in saved Jsons into dataframe
118
  path_to_json = f"output/features/{str(self.root_path).split('/',1)[1]}"
119
  df = pd.DataFrame()
120
  # Iterate over the files in the directory
@@ -137,16 +174,19 @@ class EventLogFeatures(EventLogFile):
137
  print(f"Cannot load {self.filepath}. Double check for file or change config 'load_results' to false")
138
  else:
139
  # -2 because of 'log' and 'similarity'
140
- print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)-2} features for {len(self.feat)} in {self.filepath}")
141
- print("=========================== ~ EventLogFeatures Computation=========================")
142
 
143
  #TODO: Implement optional trying to read already computed jsons first.
144
  def extract_features_wrapper(self, file, feature_set=None):
145
  try:
146
  file_path = os.path.join(self.root_path, file)
147
  print(f" INFO: Starting FEEED for {file_path} and {feature_set}")
148
- features = extract_features(file_path, feature_set)
149
 
 
 
 
 
150
  except Exception as e:
151
  print("ERROR: for ",file.rsplit(".", 1)[0], feature_set, "skipping and continuing with next log.")
152
  print(e)
@@ -156,4 +196,3 @@ class EventLogFeatures(EventLogFile):
156
  print(f" DONE: {file_path}. FEEED computed {feature_set}")
157
  dump_features_json(features, os.path.join(self.root_path,identifier))
158
  return features
159
-
 
2
  import multiprocessing
3
  import pandas as pd
4
  import os
5
+ import re
6
 
7
  from datetime import datetime as dt
8
  from functools import partial
9
+ from feeed.activities import Activities as activities
10
+ from feeed.end_activities import EndActivities as end_activities
11
+ from feeed.epa_based import Epa_based as epa_based
12
+ from feeed.eventropies import Eventropies as eventropies
13
  from feeed.feature_extractor import extract_features
14
+ from feeed.feature_extractor import feature_type, read_pm4py_log
15
+ from feeed.simple_stats import SimpleStats as simple_stats
16
+ from feeed.start_activities import StartActivities as start_activities
17
+ from feeed.trace_length import TraceLength as trace_length
18
+ from feeed.trace_variant import TraceVariant as trace_variant
19
  from gedi.utils.column_mappings import column_mappings
20
  from gedi.utils.io_helpers import dump_features_json
21
  from gedi.utils.param_keys import INPUT_PATH
22
  from gedi.utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
23
+ from pathlib import Path
24
+ from pm4py.objects.log.obj import EventLog
25
+
26
+ def _is_feature_class(name: str) -> bool:
27
+ try:
28
+ if re.match(r'^[A-Z][a-z]*([A-Z][a-z]*)*$', name):
29
+ #print("PASCAL CASE", name)
30
+ snake_case_name = re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
31
+ return hasattr(eval(snake_case_name+"()"), 'available_class_methods')
32
+ elif re.match(r'^[a-z]+(_[a-z]+)*$', name):
33
+ #print("SNAKE CASE", name)
34
+ return hasattr(eval(name+"()"), 'available_class_methods')
35
+ else:
36
+ return False
37
+ except NameError:
38
+ return False
39
+
40
def get_feature_type(ft_name):
    """Look up the feature type that *ft_name* belongs to via FEEED."""
    return feature_type(ft_name)
43
+
44
def compute_features_from_event_data(feature_set, event_data: EventLog):
    """Compute the requested features for an in-memory event log.

    :param feature_set: iterable of feature names; each entry is either a
        whole feature type (e.g. ``"trace_length"``) or a single feature
        belonging to a type.
    :param event_data: pm4py ``EventLog`` to extract the features from.
    :return: dict mapping computed feature names to their values.
    """
    features_computation = {}
    for ft_name in feature_set:
        ft_type = get_feature_type(ft_name)
        # Resolve the extractor class from module globals instead of
        # building and eval()-ing a command string: behaves the same but
        # is robust against quoting/injection issues in ft_name.
        extractor_cls = globals()[ft_type]
        if ft_type != ft_name:
            # ft_name is a single feature within the type: restrict the
            # extractor to just that feature.
            extractor = extractor_cls(feature_names=[ft_name])
        else:
            # ft_name names a whole feature type: compute all its features.
            extractor = extractor_cls()
        features_computation.update(extractor.extract(event_data))
    return features_computation
56
 
57
  class EventLogFile:
58
  def __init__(self, filename, folder_path):
 
63
  def filepath(self) -> str:
64
  return str(os.path.join(self.root_path, self.filename))
65
 
66
+ class EventDataFeatures(EventLogFile):
67
  def __init__(self, filename=None, folder_path='data/event_log', params=None, logs=None, ft_params=None):
68
  super().__init__(filename, folder_path)
69
  if ft_params == None:
 
90
 
91
  try:
92
  start = dt.now()
93
+ print("=========================== EventDataFeatures Computation===========================")
94
 
95
  print(f"INFO: Running with {ft_params}")
96
 
97
+ if str(self.filename).endswith('csv'): # Returns dataframe from loaded features file
98
  self.feat = pd.read_csv(self.filepath)
99
  columns_to_rename = {col: column_mappings()[col] for col in self.feat.columns if col in column_mappings()}
100
  self.feat.rename(columns=columns_to_rename, inplace=True)
101
+ print(f"SUCCESS: EventDataFeatures loaded features from {self.filepath}")
102
+ elif isinstance(self.filename, list): # Computes features for list of .xes files
103
  combined_features=pd.DataFrame()
104
+ #TODO: Fix IndexError when running config_files/experiment_real_targets.json
105
  if self.filename[0].endswith(".json"):
106
  self.filename = [ filename for filename in self.filename if filename.endswith(".json")]
107
  dfs = []
 
117
  self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
118
  self.root_path=Path(os.path.split(self.root_path)[0])
119
  combined_features.to_csv(self.filepath, index=False)
120
+ print(f"SUCCESS: EventDataFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)} features for {len(self.feat)} in {self.filepath}")
121
+ print("=========================== ~ EventDataFeatures Computation=========================")
122
  return
123
  else:
124
  self.filename = [ filename for filename in self.filename if filename.endswith(".xes")]
 
131
  with multiprocessing.Pool(num_cores) as p:
132
  try:
133
  print(
134
+ f"INFO: EventDataFeatures starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(self.filename)} files, namely {self.filename}...")
135
  result = p.map(partial(self.extract_features_wrapper, feature_set = self.params[FEATURE_SET])
136
  , self.filename)
137
  result = [i for i in result if i is not None]
 
151
 
152
  except KeyError as error:
153
  print("Ignoring KeyError", error)
154
+ # Aggregates features in saved Jsons into dataframe
155
  path_to_json = f"output/features/{str(self.root_path).split('/',1)[1]}"
156
  df = pd.DataFrame()
157
  # Iterate over the files in the directory
 
174
  print(f"Cannot load {self.filepath}. Double check for file or change config 'load_results' to false")
175
  else:
176
  # -2 because of 'log' and 'similarity'
177
+ print(f"SUCCESS: EventDataFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)-2} features for {len(self.feat)} in {self.filepath}")
178
+ print("=========================== ~ EventDataFeatures Computation=========================")
179
 
180
  #TODO: Implement optional trying to read already computed jsons first.
181
  def extract_features_wrapper(self, file, feature_set=None):
182
  try:
183
  file_path = os.path.join(self.root_path, file)
184
  print(f" INFO: Starting FEEED for {file_path} and {feature_set}")
 
185
 
186
+ #NOTE: Current implementation saves features in "_feat.csv" within feeed in extract_features()
187
+ #log = read_pm4py_log(file_path)
188
+ #features = compute_features_from_event_data(feature_set, log)
189
+ features = extract_features(file_path, feature_set)
190
  except Exception as e:
191
  print("ERROR: for ",file.rsplit(".", 1)[0], feature_set, "skipping and continuing with next log.")
192
  print(e)
 
196
  print(f" DONE: {file_path}. FEEED computed {feature_set}")
197
  dump_features_json(features, os.path.join(self.root_path,identifier))
198
  return features
 
gedi/run.py CHANGED
@@ -4,7 +4,7 @@ from datetime import datetime as dt
4
  from gedi.augmentation import InstanceAugmentator
5
  from gedi.benchmark import BenchmarkTest
6
  from gedi.config import get_model_params_list
7
- from gedi.features import EventLogFeatures
8
  from gedi.generator import GenerateEventLogs
9
  from gedi.plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter
10
  from gedi.utils.default_argparse import ArgParser
@@ -22,7 +22,7 @@ def run(kwargs:dict, model_params_list: list, filename_list:list):
22
  @return:
23
  """
24
  params = kwargs[PARAMS]
25
- ft = EventLogFeatures(None)
26
  augmented_ft = InstanceAugmentator()
27
  gen = pd.DataFrame(columns=['metafeatures'])
28
 
@@ -38,7 +38,7 @@ def run(kwargs:dict, model_params_list: list, filename_list:list):
38
  benchmark = BenchmarkTest(model_params)#, event_logs=gen['log'])
39
  # BenchmarkPlotter(benchmark.features, output_path="output/plots")
40
  elif model_params.get(PIPELINE_STEP) == 'feature_extraction':
41
- ft = EventLogFeatures(**kwargs, ft_params=model_params)
42
  FeaturesPlotter(ft.feat, model_params)
43
  elif model_params.get(PIPELINE_STEP) == "evaluation_plotter":
44
  GenerationPlotter(gen, model_params, output_path=model_params['output_path'], input_path=model_params['input_path'])
 
4
  from gedi.augmentation import InstanceAugmentator
5
  from gedi.benchmark import BenchmarkTest
6
  from gedi.config import get_model_params_list
7
+ from gedi.features import EventDataFeatures
8
  from gedi.generator import GenerateEventLogs
9
  from gedi.plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter
10
  from gedi.utils.default_argparse import ArgParser
 
22
  @return:
23
  """
24
  params = kwargs[PARAMS]
25
+ ft = EventDataFeatures(None)
26
  augmented_ft = InstanceAugmentator()
27
  gen = pd.DataFrame(columns=['metafeatures'])
28
 
 
38
  benchmark = BenchmarkTest(model_params)#, event_logs=gen['log'])
39
  # BenchmarkPlotter(benchmark.features, output_path="output/plots")
40
  elif model_params.get(PIPELINE_STEP) == 'feature_extraction':
41
+ ft = EventDataFeatures(**kwargs, ft_params=model_params)
42
  FeaturesPlotter(ft.feat, model_params)
43
  elif model_params.get(PIPELINE_STEP) == "evaluation_plotter":
44
  GenerationPlotter(gen, model_params, output_path=model_params['output_path'], input_path=model_params['input_path'])