# NOTE: removed web-scrape artifacts (blame hashes, line-number gutter, size banner) that were not part of the source.
import json
import multiprocessing
import pandas as pd
import os
from datetime import datetime as dt
from functools import partial
from feeed.feature_extractor import extract_features
from pathlib import Path
from utils.param_keys import INPUT_PATH
from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
from gedi.utils.io_helpers import dump_features_json
from utils.column_mappings import column_mappings
def get_sortby_parameter(elem):
    """Return the trailing integer of a filename such as 'genEL_42.xes'.

    The stem before the first '.' is split once from the right on '_' and the
    final piece is parsed as an int, giving a numeric sort key for log files.
    """
    stem = elem.rsplit(".")[0]
    return int(stem.rsplit("_", 1)[1])
class EventLogFile:
    """Pairs an event-log filename with the folder that contains it."""

    def __init__(self, filename, folder_path):
        # Normalise the folder to a Path; the filename is stored as given.
        self.root_path: Path = Path(folder_path)
        self.filename: str = filename

    @property
    def filepath(self) -> str:
        """Full path string: root folder joined with the filename."""
        joined = os.path.join(self.root_path, self.filename)
        return str(joined)
class EventLogFeatures(EventLogFile):
    """Computes or loads meta-features for one or many event logs.

    Depending on what ``ft_params[INPUT_PATH]`` points at, this either
    loads an existing features ``.csv``, aggregates per-log feature ``.json``
    files, or extracts features from ``.xes`` logs (in parallel via FEEED)
    and writes the combined result to ``<folder>_feat.csv`` next to the
    input folder. The resulting DataFrame is stored on ``self.feat``.
    """

    def __init__(self, filename=None, folder_path='data/event_log', params=None, logs=None, ft_params=None):
        """
        Args:
            filename: optional event-log filename (overridden when
                ``ft_params[INPUT_PATH]`` is set).
            folder_path: folder containing the event log(s).
            params, logs: accepted for interface compatibility; unused here.
            ft_params: pipeline configuration; must contain INPUT_PATH and
                may contain FEATURE_PARAMS holding the FEATURE_SET to extract.
        """
        super().__init__(filename, folder_path)
        # FIX: compare against None with 'is', not '=='.
        if ft_params is None:
            self.params = None
            self.feat = None
            return
        elif ft_params.get(FEATURE_PARAMS) is None:
            self.params = {FEATURE_SET: None}
        else:
            # TODO: Replace hotfix
            self.params = ft_params.get(FEATURE_PARAMS)
            feature_set = self.params.get(FEATURE_SET)
            # HOTFIX: FEEED knows this feature under its internal name, so
            # translate the public name before extraction (reversed again in
            # extract_features_wrapper). FIX: guard against FEATURE_SET=None,
            # which previously raised TypeError on the 'in' test.
            if feature_set and 'ratio_variants_per_number_of_traces' in feature_set:
                self.params[FEATURE_SET] = ['ratio_unique_traces_per_trace'
                                            if feat == 'ratio_variants_per_number_of_traces'
                                            else feat for feat in feature_set]
        # TODO: handle parameters in main, not in features. Move to main.py
        if ft_params[INPUT_PATH]:
            input_path = ft_params[INPUT_PATH]
            if os.path.isfile(input_path):
                # Single file: split into containing folder + filename.
                self.root_path = Path(os.path.split(input_path)[0])
                self.filename = os.path.split(input_path)[-1]
            else:
                # Directory: process every file in it (sorted for determinism).
                self.root_path = Path(input_path)
                # Check if directory exists, if not, create it
                # FIX: exist_ok avoids the check-then-create race.
                os.makedirs(input_path, exist_ok=True)
                self.filename = sorted(os.listdir(input_path))
        try:
            start = dt.now()
            print("=========================== EventLogFeatures Computation===========================")
            print(f"INFO: Running with {ft_params}")
            if str(self.filename).endswith('csv'):  # Returns dataframe from loaded metafeatures file
                self.feat = pd.read_csv(self.filepath)
                # FIX: hoist the mapping lookup out of the comprehension
                # (column_mappings() was called twice per column).
                mappings = column_mappings()
                columns_to_rename = {col: mappings[col] for col in self.feat.columns if col in mappings}
                self.feat.rename(columns=columns_to_rename, inplace=True)
                print(f"SUCCESS: EventLogFeatures loaded features from {self.filepath}")
            elif isinstance(self.filename, list):  # Computes metafeatures for list of .xes files
                combined_features = pd.DataFrame()
                # NOTE(review): assumes the directory is non-empty; an empty
                # INPUT_PATH directory raises IndexError here — confirm upstream.
                if self.filename[0].endswith(".json"):
                    # Aggregate pre-computed per-log feature jsons into one csv.
                    self.filename = [filename for filename in self.filename if filename.endswith(".json")]
                    dfs = []
                    for filename in self.filename:
                        print(f"INFO: Reading features from {os.path.join(self.root_path, filename)}")
                        data = pd.read_json(str(os.path.join(self.root_path, filename)), lines=True)
                        dfs.append(data)
                    combined_features = pd.concat(dfs, ignore_index=True)
                    self.feat = combined_features
                    # Persist next to the input folder as '<folder>_feat.csv'.
                    self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
                    self.root_path = Path(os.path.split(self.root_path)[0])
                    combined_features.to_csv(self.filepath, index=False)
                    print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)} features for {len(self.feat)} in {self.filepath}")
                    print("=========================== ~ EventLogFeatures Computation=========================")
                    return
                else:
                    self.filename = [filename for filename in self.filename if filename.endswith(".xes")]
                    # TODO: only include xes logs in self.filename, otherwise it will result in less rows. Implement skip exception with warning
                    try:
                        # Never spawn more workers than there are files.
                        num_cores = min(multiprocessing.cpu_count(), len(self.filename))
                        with multiprocessing.Pool(num_cores) as p:
                            try:
                                print(
                                    f"INFO: EventLogFeatures starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(self.filename)} files, namely {self.filename}...")
                                result = p.map(partial(self.extract_features_wrapper, feature_set=self.params[FEATURE_SET]),
                                               self.filename)
                                # Drop logs whose extraction failed (wrapper returns None).
                                result = [i for i in result if i is not None]
                                combined_features = pd.DataFrame.from_dict(result)
                            except Exception as e:
                                # Best-effort: log and continue with what was computed.
                                print(e)
                    except IndexError as error:
                        print("IndexError:", error)
                        # Sequential fallback when the parallel path failed.
                        for file in self.filename:
                            print(f"INFO: Computing features for {file}...")
                            # FIX: pass the bare filename — the wrapper joins
                            # root_path itself; passing a pre-joined path
                            # produced a doubled 'root/root/file' path.
                            features = self.extract_features_wrapper(file,
                                                                     feature_set=self.params[FEATURE_SET])
                            # FIX: skip failed extractions instead of crashing
                            # on None (mirrors the parallel path's filtering).
                            if features is None:
                                continue
                            features['log'] = file.rsplit(".", 1)[0]
                            temp = pd.DataFrame.from_dict([features])
                            combined_features = pd.concat([combined_features, temp], ignore_index=True)
                    except KeyError as error:
                        print("Ignoring KeyError", error)
                    # Aggregates metafeatures in saved Jsons into dataframe
                    # NOTE(review): path drops the first component of root_path
                    # using '/' only — presumably POSIX-specific; confirm on Windows.
                    path_to_json = f"output/features/{str(self.root_path).split('/',1)[1]}"
                    df = pd.DataFrame()
                    # Iterate over the files in the directory
                    for filename in os.listdir(path_to_json):
                        if filename.endswith('.json'):
                            i_path = os.path.join(path_to_json, filename)
                            with open(i_path) as f:
                                data = json.load(f)
                            temp_df = pd.DataFrame([data])
                            df = pd.concat([df, temp_df])
                    combined_features = df
                    self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
                    self.root_path = Path(os.path.split(self.root_path)[0])
                    combined_features.to_csv(self.filepath, index=False)
                    self.feat = combined_features
        except (IOError, FileNotFoundError) as err:
            print(err)
            print(f"Cannot load {self.filepath}. Double check for file or change config 'load_results' to false")
        else:
            print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)-1} features for {len(self.feat)} in {self.filepath}")
            print("=========================== ~ EventLogFeatures Computation=========================")
    # TODO: Implement optional trying to read already computed jsons first.

    def extract_features_wrapper(self, file, feature_set=None):
        """Run FEEED feature extraction for a single event log.

        Args:
            file: log filename relative to ``self.root_path``.
            feature_set: feature names to extract (FEEED-internal names).

        Returns:
            The feature dict (also dumped to ``<root>/<log>.json``), or
            None when extraction failed for this log.
        """
        try:
            file_path = os.path.join(self.root_path, file)
            print(f" INFO: Starting FEEED for {file_path} and {feature_set}")
            features = extract_features(file_path, feature_set)
            # TODO: Replace hotfix
            # FIX: membership test instead of truthiness, so a legitimate
            # falsy value (e.g. 0.0) is still renamed to the public name.
            if 'ratio_unique_traces_per_trace' in features:  # HOTFIX
                features['ratio_variants_per_number_of_traces'] = features.pop('ratio_unique_traces_per_trace')
        except Exception as e:
            print("ERROR: for ", file.rsplit(".", 1)[0], feature_set, "skipping and continuing with next log.")
            print(e)
            return None
        identifier = file.rsplit(".", 1)[0]
        print(f" DONE: {file_path}. FEEED computed {feature_set}")
        dump_features_json(features, os.path.join(self.root_path, identifier))
        return features