import json
import multiprocessing
import os
import pandas as pd
import subprocess
from datetime import datetime as dt
from functools import partialmethod
from itertools import repeat
from pm4py import convert_to_bpmn, read_bpmn, convert_to_petri_net, check_soundness
from pm4py import discover_petri_net_inductive, discover_petri_net_ilp, discover_petri_net_heuristics
from pm4py import fitness_alignments
from pm4py import precision_alignments
from pm4py.objects.bpmn.obj import BPMN
from pm4py.objects.log.importer.xes import importer as xes_importer
from gedi.utils.io_helpers import dump_features_json
from gedi.utils.param_keys import INPUT_PATH, OUTPUT_PATH
from gedi.utils.param_keys.benchmark import MINERS
from tqdm import tqdm

class BenchmarkTest:
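    """
    Benchmarks process discovery miners on a collection of .xes event logs.

    For every log and miner, the discovered model is evaluated (alignment-based fitness
    and precision, F-score, BPMN size, control-flow complexity and Petri-net size).
    Per-log results are dumped as JSON under OUTPUT_PATH and aggregated into a CSV
    under output/benchmark/.
    """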
    def __init__(self, params=None, event_logs=None):
        start = dt.now()
        print("=========================== BenchmarkTest =============================")
        print(f"INFO: Running with {params}")

        if event_logs is None or len(event_logs) == 0:
            log_path = params[INPUT_PATH]
            if log_path.endswith(".xes"):
                # A single .xes file was passed; benchmark_wrapper resolves its name from the path.
                event_logs = [""]
            else:
                try:
                    event_logs = sorted([filename for filename in os.listdir(log_path)
                                         if filename.endswith(".xes")])
                except FileNotFoundError:
                    print(f" FAILED: Cannot find {params[INPUT_PATH]}")
                    return
        if params is not None:
            self.params = params
            log_counter = [*range(0, len(event_logs))]
            # One worker per log, capped at the number of available CPU cores.
            num_cores = min(multiprocessing.cpu_count(), len(event_logs))
            # self.benchmark_wrapper((event_logs[0], 0), miners=self.params[MINERS])  # TESTING
            with multiprocessing.Pool(num_cores) as p:
                print(f"INFO: Benchmark starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(event_logs)} files...")
                p.starmap(self.benchmark_wrapper, zip(event_logs, log_counter, repeat(self.params[MINERS])))
            # Aggregate the metafeatures from the saved JSON files into a dataframe.
            self.root_path = self.params[INPUT_PATH]
            path_to_json = f"output/benchmark/{str(self.root_path).split('/', 1)[1]}"
            if path_to_json.endswith(".xes"):
                path_to_json = path_to_json.rsplit("/", 1)[0]
            df = pd.DataFrame()
            # Iterate over the files in the directory
            for filename in sorted(os.listdir(path_to_json)):
                if filename.endswith('.json'):
                    i_path = os.path.join(path_to_json, filename)
                    with open(i_path) as f:
                        data = json.load(f)
                        temp_df = pd.DataFrame([data])
                        df = pd.concat([df, temp_df], ignore_index=True)
            benchmark_results = df
            #print(benchmark_results)

            self.filename = os.path.split(self.root_path)[-1].replace(".xes", "") + '_benchmark.csv'
            self.filepath = os.path.join("output", "benchmark", self.filename)
            os.makedirs(os.path.split(self.filepath)[0], exist_ok=True)
            benchmark_results.to_csv(self.filepath, index=False)

            self.results = benchmark_results
            print(benchmark_results)
            print(f"SUCCESS: BenchmarkTest took {dt.now()-start} sec for {len(params[MINERS])} miners"
                  f" and {len(benchmark_results)} event-logs. Saved benchmark to {self.filepath}.")
        print("========================= ~ BenchmarkTest =============================")

    def benchmark_wrapper(self, event_log, log_counter=0, miners=['ind']):
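        """
        Runs every requested miner on a single event log and dumps the results as JSON.

        :param str/EventLog event_log: name of the .xes file, or "" if INPUT_PATH points to a single file.
        :param int log_counter: index of the log; used to name logs that are passed as objects.
        :param list miners: miner identifiers, e.g. 'ind', 'imf', 'heu', 'ilp', 'sm'.
        """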
        # Mirror the input path (minus its first component) under OUTPUT_PATH.
        dump_path = os.path.join(self.params[OUTPUT_PATH],
                                 os.path.join(*os.path.normpath(self.params[INPUT_PATH]).split(os.path.sep)[1:]))
        if dump_path.endswith(".xes"):
            # INPUT_PATH points to a single log file: derive the log name from the path itself.
            event_log = os.path.split(dump_path)[-1]
            dump_path = os.path.split(dump_path)[0]

        if isinstance(event_log, str):
            log_name = event_log.replace(".xes", "")
            results = {'log': log_name}
        else:
            log_name = "gen_el_" + str(log_counter)
            results = {"log": event_log}

        for miner in miners:
            start_miner = dt.now()
            benchmark_results = [round(x, 4) for x in self.benchmark_discovery(results['log'], miner, self.params)]
            results[f"fitness_{miner}"] = benchmark_results[0]
            results[f"precision_{miner}"] = benchmark_results[1]
            results[f"fscore_{miner}"] = round(2*(benchmark_results[0]*benchmark_results[1] /
                                                  (benchmark_results[0]+benchmark_results[1])), 4)
            results[f"size_{miner}"] = benchmark_results[2]
            results[f"pnsize_{miner}"] = benchmark_results[4]
            results[f"cfc_{miner}"] = benchmark_results[3]
            results['log'] = log_name
            print(f" SUCCESS: {miner} miner for {results} took {dt.now()-start_miner} sec.")

        dump_features_json(results, os.path.join(dump_path, log_name), content_type="benchmark")
        return

    def split_miner_wrapper(self, log_path="data/real_event_logs/BPI_Challenges/BPI_Challenge_2012.xes"):
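        """
        Discovers a BPMN model for the given log with Split Miner (external Java tool).

        :param str log_path: path to the .xes event log.
        :return: the discovered BPMN graph, or None if Split Miner fails.
        """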
        jar_path = os.path.join("gedi", "libs", "split-miner-1.7.1-all.jar")
        filename = os.path.split(log_path)[-1].rsplit(".", 1)[0]
        bpmn_path = os.path.join("output", "bpmns_split", filename)
        os.makedirs(os.path.split(bpmn_path)[0], exist_ok=True)
        command = [
            "java",
            "-cp",
            f"{os.getcwd()}/gedi/libs/sm2.jar:{os.getcwd()}/tag/libs/lib/*",
            "au.edu.unimelb.services.ServiceProvider",
            "SM2",
            f"{os.getcwd()}/{log_path}",
            f"{os.getcwd()}/{bpmn_path}",
            "0.05",
        ]
        print("COMMAND", " ".join(command))
        output = subprocess.run(
            command,
            capture_output=True,
            text=True,
        )
        try:
            if "\nERROR:" in output.stdout:
                print("FAILED: SplitMiner could not create BPMN for", log_path)
                print(" SplitMiner:", output.stderr)
                return None
            return read_bpmn(bpmn_path + '.bpmn')
        except ValueError:
            print(output.stdout)

    def benchmark_discovery(self, log, miner, params=None):
        """
        Runs a discovery algorithm on a specific log and returns its performance.

        :param str/EventLog log: log from the previous pipeline step, or name of a .xes file.
        :param str miner: process discovery miner to run on the log ('ind', 'imf', 'heu', 'ilp' or 'sm').
        :param dict params: params from the config file.
        """
#print("Running benchmark_discovery with", self, log, miner, params) | |
NOISE_THRESHOLD = 0.2 | |
miner_params='' | |
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) | |
start_bench = dt.now() | |
if type(log) is str: | |
if params[INPUT_PATH].endswith('.xes'): | |
log_path = params[INPUT_PATH] | |
else: | |
log_path = os.path.join(params[INPUT_PATH], log+".xes") | |
success_msg = f" SUCCESS: Benchmarking event-log {log} with {miner} took "# {dt.now()-start_bench} sec." | |
try: | |
log = xes_importer.apply(f"{log_path}", parameters={"show_progress_bar": False}) | |
except FileNotFoundError: | |
print(f" FAILED: Cannot find {log_path}" ) | |
else: | |
log=log | |
success_msg = f" SUCCESS: Benchmarking one event-log with {miner} took "# {dt.now()-start_bench} sec." | |
if miner == 'sm': | |
bpmn_graph = self.split_miner_wrapper(log_path) | |
if bpmn_graph is None: | |
return None | |
            '''TESTING
            from pm4py.visualization.bpmn.visualizer import apply as get_bpmn_fig
            from pm4py.visualization.bpmn.visualizer import matplotlib_view as view_bpmn_fig
            bpmn_fig = get_bpmn_fig(bpmn_graph)
            view_bpmn_fig(bpmn_fig)
            '''
            net, im, fm = convert_to_petri_net(bpmn_graph)
            is_sound = check_soundness(net, im, fm)
        else:
            if miner == 'imf':
                miner = 'inductive'
                miner_params = f', noise_threshold={NOISE_THRESHOLD}'
            elif miner == 'ind':
                miner = 'inductive'
            elif miner == 'heu':
                miner = 'heuristics'
            # Resolves to one of the discover_petri_net_* functions imported above.
            net, im, fm = eval(f"discover_petri_net_{miner}(log{miner_params})")
            bpmn_graph = convert_to_bpmn(net, im, fm)

        fitness = fitness_alignments(log, net, im, fm)['log_fitness']
        precision = precision_alignments(log, net, im, fm)
        pn_size = len(net._PetriNet__places)
        size = len(bpmn_graph._BPMN__nodes)
        # Control-flow complexity, approximated by the number of exclusive gateways.
        cfc = sum([isinstance(node, BPMN.ExclusiveGateway) for node in bpmn_graph._BPMN__nodes])
        #generalization = generalization_evaluator.apply(log, net, im, fm)
        #simplicity = simplicity_evaluator.apply(net)
        print(success_msg + f"{dt.now()-start_bench} sec.")
        return fitness, precision, size, cfc, pn_size  #, generalization, simplicity
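

# Minimal usage sketch (not part of the original module): BenchmarkTest is normally
# driven by the GEDI pipeline with params taken from a config file. The paths and
# miner list below are illustrative placeholders only.
if __name__ == "__main__":
    example_params = {
        INPUT_PATH: "data/event_logs",    # directory containing .xes logs (placeholder path)
        OUTPUT_PATH: "output/benchmark",  # where per-log JSON results are written (placeholder path)
        MINERS: ["ind", "heu"],           # inductive and heuristics miner, see benchmark_discovery
    }
    BenchmarkTest(example_params)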