# Event-log feature extraction: loads precomputed feature tables (.csv/.json)
# or computes meta-features for .xes event logs via FEEED.
import json
import multiprocessing
import numpy as np
import pandas as pd
import os
from datetime import datetime as dt
from functools import partial
from feeed.feature_extractor import extract_features
from pathlib import Path, PurePath
from sklearn.impute import SimpleImputer
from utils.param_keys import INPUT_PATH
from utils.param_keys.features import FEATURE_PARAMS, FEATURE_SET
from gedi.utils.io_helpers import dump_features_json
def get_sortby_parameter(elem):
    """Return the trailing numeric suffix of a filename, e.g. 'genEL_7.xes' -> 7.

    The extension is dropped first, then the part after the last underscore
    is parsed as an integer (used as a sort key for generated log files).
    """
    stem = elem.rsplit(".")[0]
    suffix = stem.rsplit("_", 1)[1]
    return int(suffix)
class EventLogFile:
    """Pairs an event-log filename with the folder that contains it."""

    def __init__(self, filename, folder_path):
        # Normalize the folder to a Path; the filename is stored as given
        # (subclasses may later replace it with a list of directory entries).
        self.root_path: Path = Path(folder_path)
        self.filename: str = filename

    @property
    def filepath(self) -> str:
        """Full path to the file as a plain string."""
        joined = os.path.join(self.root_path, self.filename)
        return str(joined)
class EventLogFeatures(EventLogFile):
    """Loads or computes meta-features for one or many event logs.

    Depending on the configured input path, ``__init__`` either
      * reads an existing feature table from a ``.csv`` file,
      * aggregates per-log feature ``.json`` files into one table, or
      * extracts features from ``.xes`` logs (in parallel via FEEED) and
        persists the combined result as ``<dirname>_feat.csv``.

    The resulting DataFrame is stored on ``self.feat`` (``None`` when no
    feature parameters were supplied).
    """

    def __init__(self, filename, folder_path='data/event_log', params=None, logs=None, ft_params=None):
        super().__init__(filename, folder_path)
        if ft_params is None:  # fixed '== None' -> identity check
            # Nothing to do without feature parameters.
            self.params = None
            self.feat = None
            return
        elif ft_params.get(FEATURE_PARAMS) is None:
            # No explicit feature params: fall back to the full feature set.
            self.params = {FEATURE_SET: None}
        else:
            self.params = ft_params.get(FEATURE_PARAMS)

        # TODO: handle parameters in main, not in features. Move to main.py
        if ft_params[INPUT_PATH]:
            input_path = ft_params[INPUT_PATH]
            if os.path.isfile(input_path):
                # Single file: split into containing folder + file name.
                self.root_path = Path(os.path.split(input_path)[0])
                self.filename = os.path.split(input_path)[-1]
            else:
                # Directory of logs: self.filename becomes the *list* of entries.
                self.root_path = Path(input_path)
                # exist_ok avoids the check-then-create race of the old
                # 'if not os.path.exists(...): os.makedirs(...)' pattern.
                os.makedirs(input_path, exist_ok=True)
                self.filename = os.listdir(input_path)
        try:
            start = dt.now()
            print("=========================== EventLogFeatures Computation===========================")
            print(f"INFO: Running with {ft_params}")
            if str(self.filename).endswith('csv'):  # Returns dataframe from loaded metafeatures file
                self.feat = pd.read_csv(self.filepath)
                print(f"SUCCESS: EventLogFeatures loaded features from {self.filepath}")
            elif isinstance(self.filename, list):  # Computes metafeatures for list of .xes files
                combined_features = pd.DataFrame()
                # NOTE(review): an empty directory makes self.filename[0] raise
                # IndexError, which the outer except does not catch — confirm intended.
                if self.filename[0].endswith(".json"):
                    # Aggregate pre-computed per-log feature .json files.
                    self.filename = [filename for filename in self.filename if filename.endswith(".json")]
                    dfs = []
                    for filename in self.filename:
                        print(f"INFO: Reading features from {os.path.join(self.root_path, filename)}")
                        data = pd.read_json(str(os.path.join(self.root_path, filename)), lines=True)
                        dfs.append(data)
                    combined_features = pd.concat(dfs, ignore_index=True)
                    self.feat = combined_features
                    # Persist one combined CSV named after the source directory,
                    # written next to (not inside) that directory.
                    self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
                    self.root_path = Path(os.path.split(self.root_path)[0])
                    combined_features.to_csv(self.filepath, index=False)
                    print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)} features for {len(self.feat)} in {self.filepath}")
                    print("=========================== ~ EventLogFeatures Computation=========================")
                    return
                else:
                    self.filename = [filename for filename in self.filename if filename.endswith(".xes")]
                # TODO: only include xes logs in self.filename, otherwise it will result in less rows. Implement skip exception with warning
                try:
                    # At most one worker per log file.
                    num_cores = multiprocessing.cpu_count() if len(
                        self.filename) >= multiprocessing.cpu_count() else len(self.filename)
                    with multiprocessing.Pool(num_cores) as p:
                        try:
                            print(
                                f"INFO: EventLogFeatures starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(self.filename)} files, namely {self.filename}...")
                            result = p.map(partial(self.extract_features_wrapper, feature_set=self.params[FEATURE_SET]),
                                           self.filename)
                            # Drop logs whose extraction failed (wrapper returns None).
                            result = [i for i in result if i is not None]
                            combined_features = pd.DataFrame.from_dict(result)
                        except Exception as e:
                            # Best-effort: log and continue with whatever was computed.
                            print(e)
                except IndexError as error:
                    # Fallback: extract features sequentially, one file at a time.
                    print("IndexError:", error)
                    for file in self.filename:
                        print(f"INFO: Computing features for {file}...")
                        features = self.extract_features_wrapper(str(os.path.join(self.root_path, file)),
                                                                 feature_set=self.params[FEATURE_SET])
                        features['log'] = file.rsplit(".", 1)[0]
                        temp = pd.DataFrame.from_dict([features])
                        combined_features = pd.concat([combined_features, temp], ignore_index=True)
                except KeyError as error:
                    print("Ignoring KeyError", error)
                    # Aggregates metafeatures in saved Jsons into dataframe
                    path_to_json = f"output/features/{str(self.root_path).split('/',1)[1]}"
                    df = pd.DataFrame()
                    # Iterate over the files in the directory
                    for filename in os.listdir(path_to_json):
                        if filename.endswith('.json'):
                            i_path = os.path.join(path_to_json, filename)
                            with open(i_path) as f:
                                data = json.load(f)
                            temp_df = pd.DataFrame([data])
                            df = pd.concat([df, temp_df])
                    combined_features = df
                # Persist the combined table as '<dirname>_feat.csv' beside the
                # source directory and expose it on self.feat.
                self.filename = os.path.split(self.root_path)[-1] + '_feat.csv'
                self.root_path = Path(os.path.split(self.root_path)[0])
                combined_features.to_csv(self.filepath, index=False)
                self.feat = combined_features
        except (IOError, FileNotFoundError) as err:
            print(err)
            print(f"Cannot load {self.filepath}. Double check for file or change config 'load_results' to false")
        else:
            print(f"SUCCESS: EventLogFeatures took {dt.now()-start} sec. Saved {len(self.feat.columns)-1} features for {len(self.feat)} in {self.filepath}")
            print("=========================== ~ EventLogFeatures Computation=========================")

    # TODO: Implement optional trying to read already computed jsons first.
    def extract_features_wrapper(self, file, feature_set=None):
        """Run FEEED feature extraction for a single log file.

        Returns the feature dict (also dumped to JSON via
        dump_features_json), or None when extraction raised — the caller
        filters those out so one bad log does not abort the batch.
        """
        try:
            file_path = os.path.join(self.root_path, file)
            print(f"    INFO: Starting FEEED for {file_path} and {feature_set}")
            features = extract_features(file_path, feature_set)
        except Exception as e:
            print("ERROR: for ", file.rsplit(".", 1)[0], feature_set, "skipping and continuing with next log.")
            print(e)
            return None
        identifier = file.rsplit(".", 1)[0]
        print(f"    DONE: {file_path}. FEEED computed {feature_set}")
        dump_features_json(features, self.root_path, identifier)
        return features