File size: 23,007 Bytes
e60e568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import sys
from copy import deepcopy, copy
from enum import Enum
from typing import Optional, Dict, Any, Union, Tuple
import numpy as np

from pm4py.algo.conformance.alignments.petri_net import algorithm as ali
from pm4py.algo.conformance.alignments.petri_net.variants import state_equation_a_star as star
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.obj import EventLog
from pm4py.objects.petri_net import properties as petri_properties
from pm4py.objects.petri_net.obj import PetriNet, Marking
from pm4py.statistics.attributes.log.select import select_attributes_from_log_for_tree
from pm4py.statistics.variants.log import get as variants_module
from pm4py.util import constants, xes_constants
from pm4py.util import exec_utils, pandas_utils
from pm4py.visualization.decisiontree.util import dt_to_string
import pandas as pd


class Parameters(Enum):
    """Keys of the ``parameters`` dictionary that are read by the functions of this module."""
    # Name of the event attribute holding the activity; apply() leaves this attribute out of the features.
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    # Flag: when true, transitions are identified by their label, otherwise by their name (default: True).
    LABELS = "labels"


def create_data_petri_nets_with_decisions(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking,
                                          final_marking: Marking) -> Tuple[PetriNet, Marking, Marking]:
    """
    Given a Petri net, create a data Petri net with the decisions given for each place by the decision
    mining algorithm

    A decision tree is learned for every place of the net. Places for which no tree can be learned
    (for instance because they are not decision points) are skipped. The guards are stored in the
    ``properties`` of the transitions of ``net``, i.e. the net passed as argument is modified in place
    and returned.

    Parameters
    ----------------
    log
        Event log
    net
        Petri net (modified in place)
    initial_marking
        Initial marking
    final_marking
        Final marking

    Returns
    ------------------
    data_petri_net
        Data petri net (the same object as ``net``)
    initial_marking
        Initial marking (unchanged)
    final_marking
        Final marking (unchanged)
    """
    all_conditions = {}
    all_variables = {}
    for place in net.places:
        try:
            clf, columns, targets = get_decision_tree(log, net, initial_marking, final_marking,
                                                      decision_point=place.name,
                                                      parameters={"labels": False})
            target_classes, variables = dt_to_string.apply(clf, columns)
            # the classifier works on integer class indexes: translate them back to transition names
            place_conditions = {targets[int(k)]: v for k, v in target_classes.items()}
            place_variables = {targets[int(k)]: v for k, v in variables.items()}
            # resolve the variables of every guarded transition while still inside the try, so that a
            # place with inconsistent results is skipped as a whole instead of being merged halfway
            place_variables = {name: place_variables[name] for name in place_conditions}
        except (Exception, SystemExit):
            # Best effort: a place without a learnable decision tree gets no guard.
            # SystemExit is listed explicitly because the bare "except" used previously swallowed it too
            # (argument validation further down the call chain may terminate through sys.exit()).
            # KeyboardInterrupt is deliberately NOT caught, so the user can abort the computation.
            continue
        all_conditions.update(place_conditions)
        all_variables.update(place_variables)
    for trans in net.transitions:
        if trans.name in all_conditions:
            trans.properties[petri_properties.TRANS_GUARD] = all_conditions[trans.name]
            trans.properties[petri_properties.READ_VARIABLE] = all_variables[trans.name]
            trans.properties[petri_properties.WRITE_VARIABLE] = []
    return net, initial_marking, final_marking


def get_decision_tree(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking, final_marking: Marking,
                      decision_point=None, attributes=None,
                      parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Any:
    """
    Learns a decision tree classifier for one decision point of the model

    The training data (features and target class) is obtained through apply(), then a
    decision tree is fitted on it.

    Parameters
    --------------
    log
        Event log
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    decision_point
        Name of the place for which the decision is learned:
        - if not specified, an exception listing the possible decision points is raised
        - if specified, the decision tree for that place is produced
    attributes
        Attributes of the log used as features. If not specified, an automatic
        attribute selection is performed.
    parameters
        Parameters of the algorithm

    Returns
    ---------------
    clf
        Fitted decision tree
    feature_names
        The names of the features (columns of the training data)
    classes
        The names of the target classes
    """
    from pm4py.util import ml_utils

    effective_parameters = {} if parameters is None else parameters

    features, target, class_names = apply(log, net, initial_marking, final_marking,
                                          decision_point=decision_point, attributes=attributes,
                                          parameters=effective_parameters)

    classifier = ml_utils.DecisionTreeClassifier().fit(features, target)
    feature_names = list(features.columns.values.tolist())
    return classifier, feature_names, class_names


def apply(log: Union[EventLog, pd.DataFrame], net: PetriNet, initial_marking: Marking, final_marking: Marking, decision_point=None,
          attributes=None, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Any:
    """
    Builds the training data (features, encoded target class and target class names)
    needed to learn a classifier for one decision point

    Parameters
    --------------
    log
        Event log
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    decision_point
        Name of the place for which the decision is learned:
        - if not specified, an exception listing the possible decision points is raised
        - if specified, the training data for that place is produced
    attributes
        Attributes of the log used as features. If not specified, an automatic
        attribute selection is performed.
    parameters
        Parameters of the algorithm (Parameters.LABELS, Parameters.ACTIVITY_KEY)

    Returns
    ---------------
    X
        Features (string attributes are one-hot encoded)
    y
        Target class (integer codes)
    class_name
        Target class names
    """
    import pandas as pd

    if parameters is None:
        parameters = {}

    labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)

    if decision_point is None:
        raise Exception("please provide decision_point as argument of the method. Possible decision points: ",
                        get_decision_points(net, labels=labels, parameters=parameters))

    if attributes is None:
        _, str_ev_attr, _, num_ev_attr = select_attributes_from_log_for_tree(log)
        attributes = [*str_ev_attr, *num_ev_attr]

    decision_table, _ = get_decisions_table(log, net, initial_marking, final_marking, attributes=attributes,
                                            pre_decision_points=[decision_point], parameters=parameters)

    # the activity itself is never used as a feature
    feature_attributes = [name for name in attributes if not name == activity_key]

    categorical = set()
    categorical_rows = []
    other_rows = []
    outcomes = []
    for observation, outcome in decision_table[decision_point]:
        categorical_row = {}
        other_row = {}
        for name, value in observation.items():
            if name not in feature_attributes:
                continue
            # string values are one-hot encoded later on, every other value is kept as it is
            if type(value) is str:
                categorical.add(name)
                categorical_row[name] = value
            else:
                other_row[name] = value
        categorical_rows.append(categorical_row)
        other_rows.append(other_row)
        outcomes.append(outcome)

    encoded_part = pd.get_dummies(data=pandas_utils.instantiate_dataframe(categorical_rows),
                                  columns=list(categorical))
    plain_part = pandas_utils.instantiate_dataframe(other_rows)
    X = pandas_utils.concat([encoded_part, plain_part], axis=1)

    target_frame = pandas_utils.instantiate_dataframe(outcomes, columns=["Name"])
    target_frame, targets = encode_target(target_frame, "Name")
    return X, target_frame['Target'], targets


def _check_optional_name_list(value, not_a_list_message, empty_list_message):
    """Validates an optional list argument: None is accepted, anything else must be a non-empty list."""
    if value is None:
        return
    if not isinstance(value, list):
        raise TypeError(not_a_list_message)
    if not value:
        raise ValueError(empty_list_message)


def get_decisions_table(log0, net, initial_marking, final_marking, attributes=None, use_trace_attributes=False, k=1,
                        pre_decision_points=None, trace_attributes=None, parameters=None):
    """
    Gets a decision table out of a log and an accepting Petri net

    Parameters
    -----------------
    log0
        Event log (it is deep-copied, the provided object is not modified)
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    attributes
        List of attributes which are considered
        (if not provided, all the attributes are considered)
    use_trace_attributes
        Include trace attributes in the decision table
        (automatically enabled when a list of trace attributes is provided)
    k
        Number that determines the number of last activities to take into account
    pre_decision_points
        List of Strings of place Names that have to be considered as decision points.
        If not provided, the decision points are inferred from the Petri net
    trace_attributes
        List of trace attributes to consider
    parameters
        Possible parameters of the algorithm

    Returns
    --------------
    I
        decision table
    decision_points
        The decision points as places of the Petri net, which are the keys of a dictionary
        having as values the list of transitions that are target

    Raises
    --------------
    TypeError
        If pre_decision_points, attributes or trace_attributes is provided but is not a list
    ValueError
        If pre_decision_points, attributes or trace_attributes is an empty list
    """
    # Validate the arguments first: it is cheap, and invalid input is reported through exceptions
    # instead of terminating the whole interpreter.
    _check_optional_name_list(
        pre_decision_points,
        "Error: The parameter pre_decision_points has to be a list of names of the places that have to be considered.",
        "Error: There must be at least one element in the list of pre_decision_points.")
    _check_optional_name_list(
        attributes,
        "Error: The parameter attributes has to be a list of names of event attributes that have to be considered.",
        "Error: There must be at least one element in the list of attributes.")
    if not use_trace_attributes and isinstance(trace_attributes, list):
        print(
            "Note: Since a list of considerable trace attributes is provided, and use_trace_attributes was set on False, we set it on True")
        use_trace_attributes = True
    _check_optional_name_list(
        trace_attributes,
        "Error: The parameter trace_attributes has to be a list of names of trace attributes that have to be considered.",
        "Error: There must be at least one element in the list of trace_attributes.")

    if parameters is None:
        parameters = {}

    labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)

    # work on a copy: prepare_event_log() below renames the attributes in place
    log = deepcopy(log0)
    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    # the same decision points twice: once identified by transition name, once by name or label
    # (depending on the "labels" parameter)
    decision_points = get_decision_points(net, pre_decision_points=pre_decision_points, parameters=parameters)
    decision_points_names = get_decision_points(net, labels=labels, pre_decision_points=pre_decision_points,
                                                parameters=parameters)

    if use_trace_attributes:
        # Made to ensure distinguishness between event and trace attributes.
        log = prepare_event_log(log)
        if attributes is not None:
            attributes = prepare_attributes(attributes)
        # NOTE(review): a user-provided trace_attributes list is used as it is, although the trace
        # attributes of the log now carry the "t_" prefix - confirm that callers pass prefixed names.
        if trace_attributes is None:
            # if no list of trace attributes is provided, we create one
            trace_attributes = list({name for trace in log for name in trace.attributes})
    if attributes is None:
        # if no list is given, every attribute of the events are considered
        attributes = [name for trace in log for event in trace for name in event.keys()]
    attributes = list(set(attributes))
    I = get_attributes(log, decision_points,
                       attributes, use_trace_attributes, trace_attributes,
                       k, net, initial_marking, final_marking, decision_points_names, parameters=parameters)
    return (I, decision_points)


def _prefix_keys_in_place(mapping, prefix):
    """Puts prefix in front of every key of mapping, keeping the mapping object itself and the order of its entries."""
    # Build the renamed entries completely before touching the mapping: renaming key by key would
    # overwrite (and lose) an entry whose key already looks like a renamed one, e.g. "a" and "t_a".
    renamed = {prefix + key: value for key, value in mapping.items()}
    mapping.clear()
    mapping.update(renamed)


def prepare_event_log(log):
    """
    If trace attributes are considered, it is possible that trace attributes have the same name as event attributes.
    To tackle this issue, the attributes get renamed.
    For trace attributes, we add "t_" at the beginning of the dictionary keys.
    For event attributes, we add "e_" at the beginning of the dict keys.
    The renaming is done in place on the traces and events of the provided log.
    :param log: Event log whose trace and event attributes are renamed
    :return: the same log object, with renamed attributes
    """
    for trace in log:
        _prefix_keys_in_place(trace.attributes, "t_")
        for event in trace:
            _prefix_keys_in_place(event._dict, "e_")
    return log


def prepare_attributes(attributes):
    """
    Puts "e_" in front of every given event attribute name; needed when trace attributes are considered
    as well, because the event attributes of the log are renamed in the same way.
    :param attributes: List of event attributes that the user wants to consider.
    :return: new list with the prefixed attribute names (the input is left untouched)
    """
    return ["e_" + name for name in attributes]


def get_decision_points(net, labels=False, pre_decision_points=None, parameters=None):
    """
    Finds the decision places of a Petri net, i.e. the places having at least two outgoing arcs.
    :param net: Petri Net where decision points are discovered (places with at least two outgoing arcs)
    :param labels: If True, the transitions following a decision point are reported by label instead of by name ("ID")
    :param pre_decision_points: Optional collection of place names; when given, only the decision points
    contained in it are returned
    :param parameters: Possible parameters of the algorithm (currently not read)
    :return: Dictionary associating the name of every decision place to the list of its target transitions
    :raises Exception: if pre_decision_points is given and none of its elements is a decision point
    """
    if parameters is None:
        parameters = {}

    # targets of the outgoing arcs, for every place of the net
    successors = {place.name: [] for place in net.places}
    for arc in net.arcs:
        if arc.source not in net.places:
            continue
        target = arc.target
        successors[arc.source.name].append(target.label if labels == True else target.name)

    decision_points = {name: targets for name, targets in successors.items() if len(targets) >= 2}
    if pre_decision_points is None:
        return decision_points

    # restrict the result to the places requested by the user
    requested = {name: targets for name, targets in decision_points.items() if name in pre_decision_points}
    matched = len(requested)
    if matched != len(pre_decision_points):
        if matched == 0:
            raise Exception("None of the given points is a decision point.")
        print(
            "Not all of the given places were identified as decision points. However, we only take the correct decision points from your list into account.")
    return requested


def simplify_token_replay(replay):
    """
    Keeps one replay result per distinct sequence of activated transitions.
    For every sequence the first result having it is kept, and the kept results stay in their original order.
    The input is traversed only once, so any iterable (also a one-shot iterator) is accepted.
    :param replay: Iterable of token-based replay results (dictionaries with the key 'activated_transitions')
    :return: List with the first result of every distinct sequence of activated transitions
    """
    seen_sequences = set()
    smaller_replay = []
    for element in replay:
        sequence = tuple(element['activated_transitions'])
        if sequence not in seen_sequences:
            seen_sequences.add(sequence)
            smaller_replay.append(element)
    return smaller_replay


def get_attributes(log, decision_points, attributes, use_trace_attributes, trace_attributes, k, net, initial_marking,
                   final_marking, decision_points_names, parameters=None):
    """
    This method aims to construct, for each decision place, a table of observations: every observation couples
    the attribute values known before a decision with the transition that has been chosen.
    Variants that are replayed by the token-based replay with fitness 1.0 are handled directly on the sequence of
    activated transitions; for the other variants an alignment of one example trace is computed and used.
    :param log: Log on which the method is applied
    :param decision_points: Places that have multiple outgoing arcs (place name -> names of the target transitions)
    :param attributes: Attributes that are considered
    :param use_trace_attributes: If trace attributes have to be considered or not
    :param trace_attributes: List of trace attributes that are considered
    :param k: Taking k last activities into account
    :param net: Petri net
    :param initial_marking: Initial marking
    :param final_marking: Final marking
    :param decision_points_names: Same places as decision_points, with the target transitions given by label or by
    name, depending on the "labels" parameter
    :param parameters: Parameters of the algorithm (Parameters.LABELS; also forwarded to the variants computation,
    the token-based replay and the alignments)
    :return: Dictionary that has as keys the decision places. The value for this key is a list.
    The content of these lists are tuples. The first element of these tuples is information regrading the attributes,
    the second element of these tuples is the transition which chosen in a decision.
    """
    if parameters is None:
        parameters = {}
    labels = exec_utils.get_param_value(Parameters.LABELS, parameters, True)

    # I: the decision table that is returned (one list of observations per decision place)
    I = {}
    for key in decision_points:
        I[key] = []
    # A: the current value of every considered attribute.
    # NOTE: A is created once and is not reset between traces, so a value read in one trace stays in A
    # until another event (or trace) overwrites it.
    A = {}
    for attri in attributes:
        A[attri] = None
    # (i is not used afterwards)
    i = 0
    # first, take a look at the variants
    variants_idxs = variants_module.get_variants_from_log_trace_idx(log, parameters=parameters)
    # one_variant: the variants as a list, in order to access them by position
    one_variant = []
    for variant in variants_idxs:
        one_variant.append(variant)
        # TODO: equip the token-based replay code with a parameter to consider only the variants
    replay_result = token_replay.apply(log, net, initial_marking, final_marking, parameters=parameters)
    # keep one replay result for each distinct sequence of activated transitions
    replay_result = simplify_token_replay(replay_result)
    # count: position of the current replay result, used to look up the corresponding variant.
    # NOTE(review): this assumes that the n-th simplified replay result belongs to the n-th variant
    # (i.e. distinct variants never share the same activated transitions) - TODO confirm
    count = 0
    for variant in replay_result:
        if variant['trace_fitness'] == 1.0:
            # perfectly fitting variant: follow the transitions activated by the token-based replay
            for trace_index in variants_idxs[one_variant[count]]:
                # last_k_list: up to k snapshots of A, written in a circular manner
                last_k_list = [None] * k
                trace = log[trace_index]
                if use_trace_attributes:
                    for attribute in trace_attributes:
                        # can be done here since trace attributes does not change for whole trace
                        A[attribute] = trace.attributes[attribute]
                j = 0
                # j is a pointer which points to the current event inside a trace
                for transition in variant['activated_transitions']:
                    for key, value in decision_points_names.items():
                        tr_to_str = transition.label if labels else transition.name
                        if tr_to_str in value:
                            # the transition is a target of this decision place: record every stored
                            # snapshot together with the transition that has been chosen
                            for element in last_k_list:
                                if element != None:
                                    I[key].append((element.copy(), tr_to_str))
                    # the snapshot is updated only after the decision has been recorded, so a decision
                    # is described by the attribute values that were stored before it
                    for attri in attributes:
                        # print(variant, transition.label, j)
                        if attri in trace[j]:
                            # only add the attribute information if it is present in the event
                            A[attri] = trace[j][attri]
                    # add A to last_k_list. Using modulo to access correct entry
                    last_k_list[j % k] = A.copy()
                    if transition.label != None:
                        # only transitions having a label move the event pointer
                        if not j + 1 >= len(trace):
                            # Problem otherwise: If there are tau-transition after the last event related transition,
                            # the pointer j which points to the current event in a trace, gets out of range
                            j += 1
        else:
            # not perfectly fitting variant: use an alignment, computed once on the first trace of the
            # variant and reused for all the traces of the variant
            example_trace = log[variants_idxs[one_variant[count]][0]]
            align_parameters = copy(parameters)
            align_parameters[star.Parameters.PARAM_ALIGNMENT_RESULT_IS_SYNC_PROD_AWARE] = True
            alignment = ali.apply(example_trace, net, initial_marking, final_marking,
                                  parameters=align_parameters)['alignment']
            for trace_index in variants_idxs[one_variant[count]]:
                last_k_list = [None] * k
                trace = log[trace_index]
                if use_trace_attributes:
                    for attribute in trace_attributes:
                        # can be done here since trace attributes does not change for whole trace
                        A[attribute] = trace.attributes[attribute]
                j = 0
                # NOTE(review): each move is presumably ((log name, transition name), (log label, transition label)),
                # with '>>' marking the side that does not move - confirm against the alignment documentation
                for el in alignment:
                    if el[1][1] != '>>':
                        # If move in model
                        for key, value in decision_points.items():
                            if el[0][1] in value:
                                for element in last_k_list:
                                    if element != None:
                                        # only add those entries where information is provided
                                        if el[1][1] == None:
                                            # for some dt algorithms, the entry None might be a problem, since it is left out later
                                            I[key].append((element.copy(), el[0][1]))
                                        else:
                                            I[key].append((element.copy(), el[1][1]))
                    if el[1][0] != '>>' and el[1][1] != '>>':
                        # If there is a move in log and model
                        for attri in attributes:
                            if attri in trace[j]:
                                # only add the attribute information if it is present in the event
                                A[attri] = trace[j][attri]
                        # add A to last_k_list. Using modulo to access correct entry
                        last_k_list[j % k] = A.copy()
                    if el[1][0] != '>>':
                        # only go to next event in trace if the current event has been aligned
                        # TODO: Discuss if this is correct or can lead to problems
                        j += 1
        count += 1
    return I


def encode_target(df, target_column):
    """Encode the values of a column as integers, stored in a new "Target" column.
    Method taken from: http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html

    The integer associated to a value is its position in the returned list of
    target names. The provided DataFrame is left untouched (a copy is modified).

    Args
    ----
    df -- pandas DataFrame.
    target_column -- name of the column to map to int.

    Returns
    -------
    df_mod -- copy of df having the additional "Target" column.
    targets -- list of target names.
    """
    encoded = df.copy()
    class_names = pandas_utils.format_unique(encoded[target_column].unique())
    code_by_name = {}
    for code, class_name in enumerate(class_names):
        code_by_name[class_name] = code
    # NOTE(review): the dtype of the result depends on how replace() handles object columns in the
    # installed pandas version - confirm that "Target" is always numeric
    encoded["Target"] = encoded[target_column].replace(code_by_name)
    return encoded, class_names