kota
initial commit
e60e568
raw
history blame
7.49 kB
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
import importlib.util
import warnings
from enum import Enum
from typing import Optional, Dict, Any, Union
import numpy as np
from pm4py.algo.anonymization.trace_variant_query.util.util import generate_pm4py_log
from pm4py.objects.log.obj import EventLog
from pm4py.util import exec_utils
TRACE_START = "TRACE_START"
TRACE_END = "TRACE_END"
EVENT_DELIMETER = ">>>"
class Parameters(Enum):
EPSILON = "epsilon"
K = "k"
P = "p"
SHOW_PROGRESS_BAR = "show_progress_bar"
def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
"""
Variant Laplace is described in:
Mannhardt, F., Koschmider, A., Baracaldo, N. et al. Privacy-Preserving Process Mining. Bus Inf Syst Eng 61,
595–614 (2019). https://doi.org/10.1007/s12599-019-00613-3
Parameters
-------------
log
Event log
parameters
Parameters of the algorithm:
-Parameters.EPSILON -> Strength of the differential privacy guarantee
-Parameters.K -> Maximum prefix length of considered traces for the trace-variant-query
-Parameters.P -> Pruning parameter of the trace-variant-query. Of a noisy trace variant, at least P traces
must appear. Otherwise, the trace variant and its traces won't be part of the result of the
trace variant query.
-Parameters.SHOW_PROGRESS_BAR -> Enables/disables the progress bar (default: True)
Returns
------------
anonymized_trace_variant_distribution
An anonymized trace variant distribution as an EventLog
"""
if parameters is None:
parameters = {}
epsilon = exec_utils.get_param_value(Parameters.EPSILON, parameters, 1)
k = exec_utils.get_param_value(Parameters.K, parameters, 0)
p = exec_utils.get_param_value(Parameters.P, parameters, 1)
if k == 0:
warnings.warn(
"k, the maximum prefix length of considered traces for the trace-variant-query, is set to 0, the trace-varaint-query will be empty.")
if p == 1:
warnings.warn("p, the pruning parameter, is set to 1, the trace-varaint-query might be very large.",
RuntimeWarning)
show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)
progress = None
if importlib.util.find_loader("tqdm") and show_progress_bar:
from tqdm.auto import tqdm
progress = tqdm(total=k, desc="prefix tree construction, completed prefixes of length :: ")
return privatize_tracevariants(log, epsilon, p, k, progress)
def privatize_tracevariants(log, epsilon, p, n, progress):
# transform log into event view and get prefix frequencies
event_int_mapping = create_event_int_mapping(log)
known_prefix_frequencies = get_prefix_frequencies_from_log(log)
events = list(event_int_mapping.keys())
events.remove(TRACE_START)
final_frequencies = {}
trace_frequencies = {"": 0}
for i in range(1, n + 1):
# get prefix_frequencies, using either known frequency, or frequency of parent, or 0
trace_frequencies = get_prefix_frequencies_length_n(trace_frequencies, events, i, known_prefix_frequencies)
# laplace_mechanism
trace_frequencies = apply_laplace_noise_tf(trace_frequencies, epsilon)
# prune
trace_frequencies = prune_trace_frequencies(trace_frequencies, p)
# print(trace_frequencies)
# add finished traces to output, remove from list, sanity checks
new_frequencies = {}
for entry in trace_frequencies.items():
if TRACE_END in entry[0]:
final_frequencies[entry[0]] = entry[1]
else:
new_frequencies[entry[0]] = entry[1]
trace_frequencies = new_frequencies
# print(trace_frequencies)
if progress is not None:
progress.update()
if progress is not None:
progress.close()
del progress
return generate_pm4py_log(final_frequencies)
def create_event_int_mapping(log):
event_name_list = []
for trace in log:
for event in trace:
event_name = event["concept:name"]
if not str(event_name) in event_name_list:
event_name_list.append(event_name)
event_int_mapping = {}
event_int_mapping[TRACE_START] = 0
current_int = 1
for event_name in event_name_list:
event_int_mapping[event_name] = current_int
current_int = current_int + 1
event_int_mapping[TRACE_END] = current_int
return event_int_mapping
def get_prefix_frequencies_from_log(log):
prefix_frequencies = {}
for trace in log:
current_prefix = ""
for event in trace:
current_prefix = current_prefix + event["concept:name"] + EVENT_DELIMETER
if current_prefix in prefix_frequencies:
prefix_frequencies[current_prefix] += 1
else:
prefix_frequencies[current_prefix] = 1
current_prefix = current_prefix + TRACE_END
if current_prefix in prefix_frequencies:
prefix_frequencies[current_prefix] += 1
else:
prefix_frequencies[current_prefix] = 1
return prefix_frequencies
def get_prefix_frequencies_length_n(trace_frequencies, events, n, known_prefix_frequencies):
prefixes_length_n = {}
for prefix, frequency in trace_frequencies.items():
for new_prefix in pref(prefix, events):
if new_prefix in known_prefix_frequencies:
new_frequency = known_prefix_frequencies[new_prefix]
prefixes_length_n[new_prefix] = new_frequency
else:
prefixes_length_n[new_prefix] = 0
return prefixes_length_n
def prune_trace_frequencies(trace_frequencies, P):
pruned_frequencies = {}
for entry in trace_frequencies.items():
if entry[1] >= P:
pruned_frequencies[entry[0]] = entry[1]
return pruned_frequencies
def pref(prefix, events):
prefixes_length_n = []
if not TRACE_END in prefix:
for event in events:
if event == TRACE_END:
current_prefix = prefix + event
else:
current_prefix = prefix + event + EVENT_DELIMETER
prefixes_length_n.append(current_prefix)
return prefixes_length_n
def apply_laplace_noise_tf(trace_frequencies, epsilon):
scale = 1 / epsilon
for trace_frequency in trace_frequencies:
noise = int(np.random.laplace(0, scale))
trace_frequencies[trace_frequency] = trace_frequencies[trace_frequency] + noise
if trace_frequencies[trace_frequency] < 0:
trace_frequencies[trace_frequency] = 0
return trace_frequencies