Spaces:

linpershey
/

process_mining

Running

process_mining / pm4py /algo /decision_mining /algorithm.py

kota

initial commit

e60e568 11 months ago

23 kB

	'''
	This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

	PM4Py is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	PM4Py is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
	'''
	import sys
	from copy import deepcopy, copy
	from enum import Enum
	from typing import Optional, Dict, Any, Union, Tuple
	import numpy as np

	from pm4py.algo.conformance.alignments.petri_net import algorithm as ali
	from pm4py.algo.conformance.alignments.petri_net.variants import state_equation_a_star as star
	from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
	from pm4py.objects.conversion.log import converter as log_converter
	from pm4py.objects.log.obj import EventLog
	from pm4py.objects.petri_net import properties as petri_properties
	from pm4py.objects.petri_net.obj import PetriNet, Marking
	from pm4py.statistics.attributes.log.select import select_attributes_from_log_for_tree
	from pm4py.statistics.variants.log import get as variants_module
	from pm4py.util import constants, xes_constants
	from pm4py.util import exec_utils, pandas_utils
	from pm4py.visualization.decisiontree.util import dt_to_string
	import pandas as pd


	class Parameters(Enum):
	ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
	LABELS = "labels"


	def create_data_petri_nets_with_decisions(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking,
	final_marking: Marking) -> Tuple[PetriNet, Marking, Marking]:
	"""
	Given a Petri net, create a data Petri net with the decisions given for each place by the decision
	mining algorithm

	Parameters
	----------------
	log
	Event log
	net
	Petri net
	initial_marking
	Initial marking
	final_marking
	Final marking

	Returns
	------------------
	data_petri_net
	Data petri net
	initial_marking
	Initial marking (unchanged)
	final_marking
	Final marking (unchanged)
	"""
	all_conditions = {}
	all_variables = {}
	for place in net.places:
	try:
	clf, columns, targets = get_decision_tree(log, net, initial_marking, final_marking,
	decision_point=place.name,
	parameters={"labels": False})
	target_classes, variables = dt_to_string.apply(clf, columns)
	target_classes = {targets[int(k)]: v for k, v in target_classes.items()}
	variables = {targets[int(k)]: v for k, v in variables.items()}
	for k in target_classes.keys():
	all_conditions[k] = target_classes[k]
	all_variables[k] = variables[k]
	except:
	pass
	for trans in net.transitions:
	if trans.name in all_conditions:
	trans.properties[petri_properties.TRANS_GUARD] = all_conditions[trans.name]
	trans.properties[petri_properties.READ_VARIABLE] = all_variables[trans.name]
	trans.properties[petri_properties.WRITE_VARIABLE] = []
	return net, initial_marking, final_marking


	def get_decision_tree(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking, final_marking: Marking,
	decision_point=None, attributes=None,
	parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Any:
	"""
	Gets a decision tree classifier on a specific point of the model

	Parameters
	--------------
	log
	Event log
	net
	Petri net
	initial_marking
	Initial marking
	final_marking
	Final marking
	decision_point
	Point of the process in which a decision happens:
	- if not specified, the method crashes, but provides a list of possible decision points
	- if specified, the method goes on and produce the decision tree
	attributes
	Attributes of the log. If not specified, then an automatic attribute selection
	is performed.
	parameters
	Parameters of the algorithm

	Returns
	---------------
	clf
	Decision tree
	feature_names
	The names of the features
	classes
	The classes
	"""
	from pm4py.util import ml_utils

	if parameters is None:
	parameters = {}

	X, y, targets = apply(log, net, initial_marking, final_marking, decision_point=decision_point,
	attributes=attributes, parameters=parameters)

	dt = ml_utils.DecisionTreeClassifier()
	dt = dt.fit(X, y)
	return dt, list(X.columns.values.tolist()), targets


	def apply(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking, final_marking: Marking, decision_point=None,
	attributes=None, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Any:
	"""
	Gets the essential information (features, target class and names of the target class)
	in order to learn a classifier

	Parameters
	--------------
	log
	Event log
	net
	Petri net
	initial_marking
	Initial marking
	final_marking
	Final marking
	decision_point
	Point of the process in which a decision happens:
	- if not specified, the method crashes, but provides a list of possible decision points
	- if specified, the method goes on and produce the decision tree
	attributes
	Attributes of the log. If not specified, then an automatic attribute selection
	is performed.
	parameters
	Parameters of the algorithm

	Returns
	---------------
	X
	features
	y
	Target class
	class_name
	Target class names
	"""
	import pandas as pd

	if parameters is None:
	parameters = {}

	labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)

	activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
	if decision_point is None:
	decision_points_names = get_decision_points(net, labels=labels, parameters=parameters)
	raise Exception("please provide decision_point as argument of the method. Possible decision points: ",
	decision_points_names)
	if attributes is None:
	str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = select_attributes_from_log_for_tree(log)
	attributes = list(str_ev_attr) + list(num_ev_attr)
	I, dp = get_decisions_table(log, net, initial_marking, final_marking, attributes=attributes,
	pre_decision_points=[decision_point], parameters=parameters)
	x_attributes = [a for a in attributes if not a == activity_key]
	str_attributes = set()
	non_str_attributes = set()
	x = []
	x2 = []
	y = []
	for el in I[decision_point]:
	for a, v in el[0].items():
	if a in x_attributes:
	if type(v) is str:
	str_attributes.add(a)
	else:
	non_str_attributes.add(a)
	x.append({a: v for a, v in el[0].items() if a in x_attributes and type(v) is str})
	x2.append({a: v for a, v in el[0].items() if a in x_attributes and type(v) is not str})
	y.append(el[1])
	X = pandas_utils.instantiate_dataframe(x)
	X = pd.get_dummies(data=X, columns=list(str_attributes))
	X2 = pandas_utils.instantiate_dataframe(x2)
	X = pandas_utils.concat([X, X2], axis=1)
	Y = pandas_utils.instantiate_dataframe(y, columns=["Name"])
	Y, targets = encode_target(Y, "Name")
	y = Y['Target']
	return X, y, targets


	def get_decisions_table(log0, net, initial_marking, final_marking, attributes=None, use_trace_attributes=False, k=1,
	pre_decision_points=None, trace_attributes=None, parameters=None):
	"""
	Gets a decision table out of a log and an accepting Petri net

	Parameters
	-----------------
	log0
	Event log
	net
	Petri net
	initial_marking
	Initial marking
	final_marking
	Final marking
	attributes
	List of attributes which are considered
	(if not provided, all the attributes are considered)
	use_trace_attributes
	Include trace attributes in the decision table
	k
	Number that determines the number of last activities to take into account
	pre_decision_points
	List of Strings of place Names that have to be considered as decision points.
	If not provided, the decision points are inferred from the Petri net
	trace_attributes
	List of trace attributes to consider
	parameters
	Possible parameters of the algorithm

	Returns
	--------------
	I
	decision table
	decision_points
	The decision points as places of the Petri net, which are the keys of a dictionary
	having as values the list of transitions that are target
	"""
	if parameters is None:
	parameters = {}

	labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)

	log = deepcopy(log0)
	log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

	if pre_decision_points != None:
	if not isinstance(pre_decision_points, list):
	print(
	"Error: The parameter pre_decision_points has to be a list of names of the places that have to be considered.")
	sys.exit()
	if len(pre_decision_points) == 0:
	print("Error: There must be at least one element in the list of pre_decision_points.")
	sys.exit()
	if attributes != None:
	if not isinstance(attributes, list):
	print(
	"Error: The parameter attributes has to be a list of names of event attributes that have to be considered.")
	sys.exit()
	if len(attributes) == 0:
	print("Error: There must be at least one element in the list of attributes.")
	sys.exit()
	if use_trace_attributes == False and trace_attributes != None and isinstance(trace_attributes, list):
	print(
	"Note: Since a list of considerable trace attributes is provided, and use_trace_attributes was set on False, we set it on True")
	use_trace_attributes = True
	if trace_attributes != None:
	if not isinstance(trace_attributes, list):
	print(
	"Error: The parameter trace_attributes has to be a list of names of trace attributes that have to be considered.")
	sys.exit()
	if len(trace_attributes) == 0:
	print("Error: There must be at least one element in the list of trace_attributes.")
	sys.exit()

	# alignment = ali.apply(log, net, initial_marking, final_marking, variant=True, parameters={star.PARAM_ALIGNMENT_RESULT_IS_SYNC_PROD_AWARE:True})
	decision_points = get_decision_points(net, pre_decision_points=pre_decision_points, parameters=parameters)
	decision_points_names = get_decision_points(net, labels=labels, pre_decision_points=pre_decision_points,
	parameters=parameters)
	if use_trace_attributes:
	# Made to ensure distinguishness between event and trace attributes.
	log = prepare_event_log(log)
	if attributes != None:
	attributes = prepare_attributes(attributes)

	if use_trace_attributes and trace_attributes == None:
	# if no list of trace attributes is provided, we create one
	trace_attributes = []
	if use_trace_attributes:
	for trace in log:
	trace_attributes += list(trace.attributes)
	trace_attributes = list(set(trace_attributes))
	if attributes == None:
	# if no list is given, every attribute of the events are considered
	attributes = []
	for trace in log:
	for event in trace:
	attributes += list(event.keys())
	attributes = list(set(attributes))
	I = get_attributes(log, decision_points,
	attributes, use_trace_attributes, trace_attributes,
	k, net, initial_marking, final_marking, decision_points_names, parameters=parameters)
	return (I, decision_points)


	def prepare_event_log(log):
	"""
	If trace attributes are considered, it is possible that trace attributes have the same name as event attributes.
	To tackle this issue, the attributes get renamed.
	For trace attributes, we add "t_" at the beginning of the dictionary keys.
	For event attributes, we add "e_" at the beginning of the dict keys.
	:param log:
	:return:
	"""
	for trace in log:
	attributes = trace.attributes.copy()
	for attribute in attributes:
	trace.attributes["t_" + attribute] = trace.attributes.pop(attribute)
	for event in trace:
	attributes = event._dict.copy()
	for attribute in attributes:
	event._dict["e_" + attribute] = event._dict.pop(attribute)
	return log


	def prepare_attributes(attributes):
	"""
	Method that "e_" in front of every attribute if trace attributes are considered.
	:param attributes: List of event attributes that the user wants to consider.
	:return: list of edited attribute names
	"""
	new_attributes = []
	for attribute in attributes:
	new_attributes.append("e_" + attribute)
	return new_attributes


	def get_decision_points(net, labels=False, pre_decision_points=None, parameters=None):
	"""
	The goal is to get all decision places. These are places where there are at least two outgoing arcs.
	:param net: Petri Net where decision points are discovered (places with at least two outgoing arcs)
	:param labels: If someone wants to get the labels of the transitions after a decision point and not the "ID"
	:return:
	"""
	if parameters is None:
	parameters = {}
	counter = {}
	for place in net.places:
	counter[place.name] = []
	for arc in net.arcs:
	if arc.source in net.places:
	if labels == True:
	counter[arc.source.name].append(arc.target.label)
	else:
	counter[arc.source.name].append(arc.target.name)
	decision_points = {key: val for key, val in counter.items() if len(val) >= 2}
	i = 0
	# i counts how many given decision points of the user are detected
	if pre_decision_points != None:
	for el in list(decision_points):
	if el in pre_decision_points:
	i += 1
	else:
	del decision_points[el]
	if i == len(pre_decision_points):
	# print("All given decision points were identified as decision points in the Petri Net.")
	pass
	elif i == 0:
	raise Exception("None of the given points is a decision point.")
	# sys.exit()
	else:
	print(
	"Not all of the given places were identified as decision points. However, we only take the correct decision points from your list into account.")
	return decision_points


	def simplify_token_replay(replay):
	variant = {}
	for element in replay:
	if tuple(element['activated_transitions']) not in variant:
	variant[tuple(element['activated_transitions'])] = True
	smaller_replay = []
	for element in replay:
	if variant[tuple(element['activated_transitions'])]:
	smaller_replay.append(element)
	variant[tuple(element['activated_transitions'])] = False
	return smaller_replay


	def get_attributes(log, decision_points, attributes, use_trace_attributes, trace_attributes, k, net, initial_marking,
	final_marking, decision_points_names, parameters=None):
	"""
	This method aims to construct for each decision place a table where for each decision place a list if given with the
	label of the later decision and as value the given attributes
	:param log: Log on which the method is applied
	:param alignments: Computed alignments for a log and a model
	:param decision_points: Places that have multiple outgoing arcs
	:param attributes: Attributes that are considered
	:param use_trace_attributes: If trace attributes have to be considered or not
	:param trace_attributes: List of trace attributes that are considered
	:param k: Taking k last activities into account
	:return: Dictionary that has as keys the decision places. The value for this key is a list.
	The content of these lists are tuples. The first element of these tuples is information regrading the attributes,
	the second element of these tuples is the transition which chosen in a decision.
	"""
	if parameters is None:
	parameters = {}
	labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)

	I = {}
	for key in decision_points:
	I[key] = []
	A = {}
	for attri in attributes:
	A[attri] = None
	i = 0
	# first, take a look at the variants
	variants_idxs = variants_module.get_variants_from_log_trace_idx(log, parameters=parameters)
	one_variant = []
	for variant in variants_idxs:
	one_variant.append(variant)
	# TODO: Token based replay code mit paramter für nur varianten einbeziehen ausstatten
	replay_result = token_replay.apply(log, net, initial_marking, final_marking, parameters=parameters)
	replay_result = simplify_token_replay(replay_result)
	count = 0
	for variant in replay_result:
	if variant['trace_fitness'] == 1.0:
	for trace_index in variants_idxs[one_variant[count]]:
	last_k_list = [None] * k
	trace = log[trace_index]
	if use_trace_attributes:
	for attribute in trace_attributes:
	# can be done here since trace attributes does not change for whole trace
	A[attribute] = trace.attributes[attribute]
	j = 0
	# j is a pointer which points to the current event inside a trace
	for transition in variant['activated_transitions']:
	for key, value in decision_points_names.items():
	tr_to_str = transition.label if labels else transition.name
	if tr_to_str in value:
	for element in last_k_list:
	if element != None:
	I[key].append((element.copy(), tr_to_str))
	for attri in attributes:
	# print(variant, transition.label, j)
	if attri in trace[j]:
	# only add the attribute information if it is present in the event
	A[attri] = trace[j][attri]
	# add A to last_k_list. Using modulo to access correct entry
	last_k_list[j % k] = A.copy()
	if transition.label != None:
	if not j + 1 >= len(trace):
	# Problem otherwise: If there are tau-transition after the last event related transition,
	# the pointer j which points to the current event in a trace, gets out of range
	j += 1
	else:
	example_trace = log[variants_idxs[one_variant[count]][0]]
	align_parameters = copy(parameters)
	align_parameters[star.Parameters.PARAM_ALIGNMENT_RESULT_IS_SYNC_PROD_AWARE] = True
	alignment = ali.apply(example_trace, net, initial_marking, final_marking,
	parameters=align_parameters)['alignment']
	for trace_index in variants_idxs[one_variant[count]]:
	last_k_list = [None] * k
	trace = log[trace_index]
	if use_trace_attributes:
	for attribute in trace_attributes:
	# can be done here since trace attributes does not change for whole trace
	A[attribute] = trace.attributes[attribute]
	j = 0
	for el in alignment:
	if el[1][1] != '>>':
	# If move in model
	for key, value in decision_points.items():
	if el[0][1] in value:
	for element in last_k_list:
	if element != None:
	# only add those entries where information is provided
	if el[1][1] == None:
	# for some dt algorithms, the entry None might be a problem, since it is left out later
	I[key].append((element.copy(), el[0][1]))
	else:
	I[key].append((element.copy(), el[1][1]))
	if el[1][0] != '>>' and el[1][1] != '>>':
	# If there is a move in log and model
	for attri in attributes:
	if attri in trace[j]:
	# only add the attribute information if it is present in the event
	A[attri] = trace[j][attri]
	# add A to last_k_list. Using modulo to access correct entry
	last_k_list[j % k] = A.copy()
	if el[1][0] != '>>':
	# only go to next event in trace if the current event has been aligned
	# TODO: Discuss if this is correct or can lead to problems
	j += 1
	count += 1
	return I


	def encode_target(df, target_column):
	"""Add column to df with integers for the target.
	Method taken from: http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html
	Args
	----
	df -- pandas DataFrame.
	target_column -- column to map to int, producing
	new Target column.

	Returns
	-------
	df_mod -- modified DataFrame.
	targets -- list of target names.
	"""
	df_mod = df.copy()
	targets = pandas_utils.format_unique(df_mod[target_column].unique())
	map_to_int = {name: n for n, name in enumerate(targets)}
	df_mod["Target"] = df_mod[target_column].replace(map_to_int)

	return (df_mod, targets)