Spaces:
Running
Running
import multiprocessing | |
import os | |
import pandas as pd | |
import random | |
from ConfigSpace import Configuration, ConfigurationSpace | |
from datetime import datetime as dt | |
from feeed.activities import Activities as activities | |
from feeed.end_activities import EndActivities as end_activities | |
from feeed.epa_based import Epa_based as epa_based | |
from feeed.eventropies import Eventropies as eventropies | |
from feeed.feature_extractor import feature_type | |
from feeed.simple_stats import SimpleStats as simple_stats | |
from feeed.start_activities import StartActivities as start_activities | |
from feeed.trace_length import TraceLength as trace_length | |
from feeed.trace_variant import TraceVariant as trace_variant | |
from pm4py import generate_process_tree | |
from pm4py import write_xes | |
from pm4py.sim import play_out | |
from smac import HyperparameterOptimizationFacade, Scenario | |
from gedi.utils.column_mappings import column_mappings | |
from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json, compute_similarity | |
from gedi.utils.io_helpers import read_csvs | |
from gedi.utils.param_keys import OUTPUT_PATH, INPUT_PATH | |
from gedi.utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS | |
import xml.etree.ElementTree as ET | |
import re | |
from xml.dom import minidom | |
from functools import partial | |
""" | |
Parameters | |
-------------- | |
parameters | |
Parameters of the algorithm, according to the paper: | |
- Parameters.MODE: most frequent number of visible activities | |
- Parameters.MIN: minimum number of visible activities | |
- Parameters.MAX: maximum number of visible activities | |
- Parameters.SEQUENCE: probability to add a sequence operator to tree | |
- Parameters.CHOICE: probability to add a choice operator to tree | |
- Parameters.PARALLEL: probability to add a parallel operator to tree | |
- Parameters.LOOP: probability to add a loop operator to tree | |
- Parameters.OR: probability to add an or operator to tree | |
- Parameters.SILENT: probability to add silent activity to a choice or loop operator | |
- Parameters.DUPLICATE: probability to duplicate an activity label | |
- Parameters.NO_MODELS: number of trees to generate from model population | |
""" | |
RANDOM_SEED = 10 | |
random.seed(RANDOM_SEED) | |
def get_tasks(experiment, output_path="", reference_feature=None): | |
#Read tasks from file. | |
if isinstance(experiment, str) and experiment.endswith(".csv"): | |
tasks = pd.read_csv(experiment, index_col=None) | |
output_path=os.path.join(output_path,os.path.split(experiment)[-1].split(".")[0]) | |
if 'task' in tasks.columns: | |
tasks.rename(columns={"task":"log"}, inplace=True) | |
elif isinstance(experiment, str) and os.path.isdir(os.path.join(os.getcwd(), experiment)): | |
tasks = read_csvs(experiment, reference_feature) | |
#Read tasks from a real log features selection. | |
elif isinstance(experiment, dict) and INPUT_PATH in experiment.keys(): | |
output_path=os.path.join(output_path,os.path.split(experiment.get(INPUT_PATH))[-1].split(".")[0]) | |
tasks = pd.read_csv(experiment.get(INPUT_PATH), index_col=None) | |
id_col = tasks.select_dtypes(include=['object']).dropna(axis=1).columns[0] | |
if "objectives" in experiment.keys(): | |
incl_cols = experiment["objectives"] | |
tasks = tasks[(incl_cols + [id_col])] | |
# TODO: Solve/Catch error for different objective keys. | |
#Read tasks from config_file with list of targets | |
elif isinstance(experiment, list): | |
tasks = pd.DataFrame.from_dict(data=experiment) | |
#Read single tasks from config_file | |
elif isinstance(experiment, dict): | |
tasks = pd.DataFrame.from_dict(data=[experiment]) | |
else: | |
raise FileNotFoundError(f"{experiment} not found. Please check path in filesystem.") | |
return tasks, output_path | |
def removeextralines(elem): | |
hasWords = re.compile("\\w") | |
for element in elem.iter(): | |
if not re.search(hasWords,str(element.tail)): | |
element.tail="" | |
if not re.search(hasWords,str(element.text)): | |
element.text = "" | |
def add_extension_before_traces(xes_file): | |
# Register the namespace | |
ET.register_namespace('', "http://www.xes-standard.org/") | |
# Parse the original XML | |
tree = ET.parse(xes_file) | |
root = tree.getroot() | |
# Add extensions | |
extensions = [ | |
{'name': 'Lifecycle', 'prefix': 'lifecycle', 'uri': 'http://www.xes-standard.org/lifecycle.xesext'}, | |
{'name': 'Time', 'prefix': 'time', 'uri': 'http://www.xes-standard.org/time.xesext'}, | |
{'name': 'Concept', 'prefix': 'concept', 'uri': 'http://www.xes-standard.org/concept.xesext'} | |
] | |
for ext in extensions: | |
extension_elem = ET.Element('extension', ext) | |
root.insert(0, extension_elem) | |
# Add global variables | |
globals = [ | |
{ | |
'scope': 'event', | |
'attributes': [ | |
{'key': 'lifecycle:transition', 'value': 'complete'}, | |
{'key': 'concept:name', 'value': '__INVALID__'}, | |
{'key': 'time:timestamp', 'value': '1970-01-01T01:00:00.000+01:00'} | |
] | |
}, | |
{ | |
'scope': 'trace', | |
'attributes': [ | |
{'key': 'concept:name', 'value': '__INVALID__'} | |
] | |
} | |
] | |
for global_var in globals: | |
global_elem = ET.Element('global', {'scope': global_var['scope']}) | |
for attr in global_var['attributes']: | |
string_elem = ET.SubElement(global_elem, 'string', {'key': attr['key'], 'value': attr['value']}) | |
root.insert(len(extensions), global_elem) | |
# Pretty print the Xes | |
removeextralines(root) | |
xml_str = minidom.parseString(ET.tostring(root)).toprettyxml() | |
with open(xes_file, "w") as f: | |
f.write(xml_str) | |
class GenerateEventLogs(): | |
# TODO: Clarify nomenclature: experiment, task, objective as in notebook (https://github.com/lmu-dbs/gedi/blob/main/notebooks/grid_objectives.ipynb) | |
def __init__(self, params=None) -> None: | |
print("=========================== Generator ==========================") | |
if params is None: | |
default_params = {'generator_params': {'experiment': {'ratio_top_20_variants': 0.2, 'epa_normalized_sequence_entropy_linear_forgetting': 0.4}, 'config_space': {'mode': [5, 20], 'sequence': [0.01, 1], 'choice': [0.01, 1], 'parallel': [0.01, 1], 'loop': [0.01, 1], 'silent': [0.01, 1], 'lt_dependency': [0.01, 1], 'num_traces': [10, 101], 'duplicate': [0], 'or': [0]}, 'n_trials': 50}} | |
raise TypeError(f"Missing 'params'. Please provide a dictionary with generator parameters as so: {default_params}. See https://github.com/lmu-dbs/gedi for more info.") | |
print(f"INFO: Running with {params}") | |
start = dt.now() | |
if params.get(OUTPUT_PATH) is None: | |
self.output_path = 'data/generated' | |
else: | |
self.output_path = params.get(OUTPUT_PATH) | |
if not os.path.exists(self.output_path): | |
os.makedirs(self.output_path, exist_ok=True) | |
if self.output_path.endswith('csv'): | |
self.generated_features = pd.read_csv(self.output_path) | |
return | |
generator_params = params.get(GENERATOR_PARAMS) | |
experiment = generator_params.get(EXPERIMENT) | |
if experiment is not None: | |
tasks, output_path = get_tasks(experiment, self.output_path) | |
columns_to_rename = {col: column_mappings()[col] for col in tasks.columns if col in column_mappings()} | |
tasks = tasks.rename(columns=columns_to_rename) | |
self.output_path = output_path | |
if tasks is not None: | |
self.feature_keys = sorted([feature for feature in tasks.columns.tolist() if feature != "log"]) | |
num_cores = multiprocessing.cpu_count() if len(tasks) >= multiprocessing.cpu_count() else len(tasks) | |
#self.generator_wrapper([*tasks.iterrows()][0])# For testing | |
with multiprocessing.Pool(num_cores) as p: | |
print(f"INFO: Generator starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(tasks)} tasks...") | |
random.seed(RANDOM_SEED) | |
partial_wrapper = partial(self.generator_wrapper, generator_params=generator_params) | |
generated_features = p.map(partial_wrapper, [(index, row) for index, row in tasks.iterrows()]) | |
# TODO: Split log and metafeatures into separate object attributes | |
# TODO: Access not storing log in memory | |
# TODO: identify why log is needed in self.generated_features | |
self.generated_features = [ | |
{ | |
#'log': config.get('log'), | |
'metafeatures': config.get('metafeatures')} | |
for config in generated_features | |
if 'metafeatures' in config #and 'log' in config | |
] | |
else: | |
random.seed(RANDOM_SEED) | |
configs = self.optimize(generator_params=generator_params) | |
if type(configs) is not list: | |
configs = [configs] | |
temp = self.generate_optimized_log(configs[0]) | |
self.generated_features = [temp['metafeatures']] if 'metafeatures' in temp else [] | |
save_path = get_output_key_value_location(generator_params[EXPERIMENT], | |
self.output_path, "genEL")+".xes" | |
write_xes(temp['log'], save_path) | |
add_extension_before_traces(save_path) | |
print("SUCCESS: Saved generated event log in", save_path) | |
print(f"SUCCESS: Generator took {dt.now()-start} sec. Generated {len(self.generated_features)} event log(s).") | |
print(f" Saved generated logs in {self.output_path}") | |
print("========================= ~ Generator ==========================") | |
def clear(self): | |
print("Clearing parameters...") | |
self.generated_features = None | |
# self.configs = None | |
# self.params = None | |
self.output_path = None | |
self.feature_keys = None | |
def generator_wrapper(self, task, generator_params=None): | |
try: | |
identifier = [x for x in task[1] if isinstance(x, str)][0] | |
except IndexError: | |
identifier = task[0]+1 | |
identifier = "genEL" +str(identifier) | |
task = task[1].drop('log', errors='ignore') | |
self.objectives = task.dropna().to_dict() | |
random.seed(RANDOM_SEED) | |
configs = self.optimize(generator_params = generator_params) | |
random.seed(RANDOM_SEED) | |
if isinstance(configs, list): | |
generated_features = self.generate_optimized_log(configs[0]) | |
else: | |
generated_features = self.generate_optimized_log(configs) | |
save_path = get_output_key_value_location(task.to_dict(), | |
self.output_path, identifier, self.feature_keys)+".xes" | |
write_xes(generated_features['log'], save_path) | |
add_extension_before_traces(save_path) | |
print("SUCCESS: Saved generated event log in", save_path) | |
features_to_dump = generated_features['metafeatures'] | |
features_to_dump['log']= os.path.split(save_path)[1].split(".")[0] | |
# calculating the manhattan distance of the generated log to the target features | |
#features_to_dump['distance_to_target'] = calculate_manhattan_distance(self.objectives, features_to_dump) | |
features_to_dump['target_similarity'] = compute_similarity(self.objectives, features_to_dump) | |
dump_features_json(features_to_dump, save_path) | |
return generated_features | |
def generate_optimized_log(self, config): | |
''' Returns event log from given configuration''' | |
tree = generate_process_tree(parameters={ | |
"min": config["mode"], | |
"max": config["mode"], | |
"mode": config["mode"], | |
"sequence": config["sequence"], | |
"choice": config["choice"], | |
"parallel": config["parallel"], | |
"loop": config["loop"], | |
"silent": config["silent"], | |
"lt_dependency": config["lt_dependency"], | |
"duplicate": config["duplicate"], | |
"or": config["or"], | |
"no_models": 1 | |
}) | |
log = play_out(tree, parameters={"num_traces": config["num_traces"]}) | |
for i, trace in enumerate(log): | |
trace.attributes['concept:name'] = str(i) | |
for j, event in enumerate(trace): | |
event['time:timestamp'] = dt.now() | |
event['lifecycle:transition'] = "complete" | |
random.seed(RANDOM_SEED) | |
metafeatures = self.compute_metafeatures(log) | |
return { | |
"configuration": config, | |
"log": log, | |
"metafeatures": metafeatures, | |
} | |
def gen_log(self, config: Configuration, seed: int = 0): | |
random.seed(RANDOM_SEED) | |
tree = generate_process_tree(parameters={ | |
"min": config["mode"], | |
"max": config["mode"], | |
"mode": config["mode"], | |
"sequence": config["sequence"], | |
"choice": config["choice"], | |
"parallel": config["parallel"], | |
"loop": config["loop"], | |
"silent": config["silent"], | |
"lt_dependency": config["lt_dependency"], | |
"duplicate": config["duplicate"], | |
"or": config["or"], | |
"no_models": 1 | |
}) | |
random.seed(RANDOM_SEED) | |
log = play_out(tree, parameters={"num_traces": config["num_traces"]}) | |
random.seed(RANDOM_SEED) | |
result = self.eval_log(log) | |
return result | |
def compute_metafeatures(self, log): | |
for i, trace in enumerate(log): | |
trace.attributes['concept:name'] = str(i) | |
for j, event in enumerate(trace): | |
event['time:timestamp'] = dt.fromtimestamp(j * 1000) | |
event['lifecycle:transition'] = "complete" | |
metafeatures_computation = {} | |
for ft_name in self.objectives.keys(): | |
ft_type = feature_type(ft_name) | |
metafeatures_computation.update(eval(f"{ft_type}(feature_names=['{ft_name}']).extract(log)")) | |
return metafeatures_computation | |
def eval_log(self, log): | |
random.seed(RANDOM_SEED) | |
metafeatures = self.compute_metafeatures(log) | |
log_evaluation = {} | |
for key in self.objectives.keys(): | |
log_evaluation[key] = abs(self.objectives[key] - metafeatures[key]) | |
return log_evaluation | |
def optimize(self, generator_params): | |
if generator_params.get(CONFIG_SPACE) is None: | |
configspace = ConfigurationSpace({ | |
"mode": (5, 40), | |
"sequence": (0.01, 1), | |
"choice": (0.01, 1), | |
"parallel": (0.01, 1), | |
"loop": (0.01, 1), | |
"silent": (0.01, 1), | |
"lt_dependency": (0.01, 1), | |
"num_traces": (100, 1001), | |
"duplicate": (0), | |
"or": (0), | |
}) | |
print(f"WARNING: No config_space specified in config file. Continuing with {configspace}") | |
else: | |
configspace_lists = generator_params[CONFIG_SPACE] | |
configspace_tuples = {} | |
for k, v in configspace_lists.items(): | |
if len(v) == 1: | |
configspace_tuples[k] = v[0] | |
else: | |
configspace_tuples[k] = tuple(v) | |
configspace = ConfigurationSpace(configspace_tuples) | |
if generator_params.get(N_TRIALS) is None: | |
n_trials = 20 | |
print(f"INFO: Running with n_trials={n_trials}") | |
else: | |
n_trials = generator_params[N_TRIALS] | |
objectives = [*self.objectives.keys()] | |
# Scenario object specifying the multi-objective optimization environment | |
scenario = Scenario( | |
configspace, | |
deterministic=True, | |
n_trials=n_trials, | |
objectives=objectives, | |
n_workers=-1 | |
) | |
# Use SMAC to find the best configuration/hyperparameters | |
random.seed(RANDOM_SEED) | |
multi_obj = HyperparameterOptimizationFacade.get_multi_objective_algorithm( | |
scenario, | |
objective_weights=[1]*len(self.objectives), | |
) | |
random.seed(RANDOM_SEED) | |
smac = HyperparameterOptimizationFacade( | |
scenario=scenario, | |
target_function=self.gen_log, | |
multi_objective_algorithm=multi_obj, | |
# logging_level=False, | |
overwrite=True, | |
) | |
random.seed(RANDOM_SEED) | |
incumbent = smac.optimize() | |
return incumbent | |