# Hosting-page residue (not part of the module): kota | initial commit | e60e568 | raw | history blame | 15.8 kB
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
# The first string literal of this file is the license notice, so the module
# documentation is assigned to ``__doc__`` explicitly.
__doc__ = """
The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py``
"""
from typing import Union, Tuple, Any, List, Collection, Optional
import pandas as pd
import numpy as np
from pm4py.objects.ocel.obj import OCEL
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.utils import __event_log_deprecation_warning
import random
from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns
from pm4py.utils import get_properties, constants, pandas_utils
def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float = 0.8, case_id_key="case:concept:name") -> Union[
    Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Randomly partitions the cases of an event log into a training log and a test log
    (for machine learning purposes). Returns the training log followed by the test log.

    For a Pandas dataframe, one random number is drawn per distinct case identifier: the
    case is assigned to the training dataframe when the number does not exceed
    ``train_percentage``, and to the test dataframe otherwise. Every other log object is
    delegated to ``pm4py.objects.log.util.split_train_test.split`` (``case_id_key`` is not
    forwarded in that case).

    :param log: event log / Pandas dataframe
    :param train_percentage: fraction of traces to be included in the training log (from 0.0 to 1.0)
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]``

    .. code-block:: python3

        import pm4py

        train_df, test_df = pm4py.split_train_test(dataframe, train_percentage=0.75)
    """
    __event_log_deprecation_warning(log)

    if not check_is_pandas_dataframe(log):
        # non-dataframe logs are split by the dedicated utility module
        from pm4py.objects.log.util import split_train_test as log_splitter
        return log_splitter.split(log, train_percentage=train_percentage)

    check_pandas_dataframe_columns(log)

    train_ids = set()
    test_ids = set()
    for case_id in pandas_utils.format_unique(log[case_id_key].unique()):
        # a single random draw decides the side of the whole case
        bucket = train_ids if random.random() <= train_percentage else test_ids
        bucket.add(case_id)

    case_column = log[case_id_key]
    return log[case_column.isin(train_ids)], log[case_column.isin(test_ids)]
def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]:
    """
    Gets the prefixes of a log of a given length. The returned log object contains the prefixes:
    - if a trace has lower or identical length, it is included as-is
    - if a trace has greater length, it is cut

    For a Pandas dataframe, the events are first annotated with their index inside the case
    (column ``constants.DEFAULT_INDEX_IN_TRACE_KEY``) and only the rows whose index is at most
    ``length - 1`` are kept. Every other log object is delegated to
    ``pm4py.objects.log.util.get_prefixes.get_prefixes_from_log``.

    :param log: event log / Pandas dataframe
    :param length: length
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[EventLog, pd.DataFrame]``

    .. code-block:: python3

        import pm4py

        trimmed_df = pm4py.get_prefixes_from_log(dataframe, length=5, case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    if not check_is_pandas_dataframe(log):
        from pm4py.objects.log.util import get_prefixes
        return get_prefixes.get_prefixes_from_log(log, length)

    check_pandas_dataframe_columns(log, case_id_key=case_id_key)

    from pm4py.util import pandas_utils
    indexed_df = pandas_utils.insert_ev_in_tr_index(log, case_id=case_id_key)

    # presumably the index is zero-based, so ``length - 1`` is the last position kept
    last_position = length - 1
    within_prefix = indexed_df[constants.DEFAULT_INDEX_IN_TRACE_KEY] <= last_position
    return indexed_df[within_prefix]
def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name",
                                       timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name",
                                       start_timestamp_key: str = "time:timestamp") -> pd.DataFrame:
    """
    Inserts additional columns in the dataframe which are computed on the overall case, so they model the
    outcome of the case.

    The log is converted to a dataframe, enriched through
    ``pandas_utils.insert_case_arrival_finish_rate`` and
    ``pandas_utils.insert_case_service_waiting_time``, and finally merged on the case identifier
    with the per-case features returned by ``extract_features_dataframe``.

    :param log: event log / Pandas dataframe
    :param activity_key: attribute to be used for the activity
    :param timestamp_key: attribute to be used for the timestamp
    :param case_id_key: attribute to be used as case identifier
    :param start_timestamp_key: attribute to be used as start timestamp
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        enriched_df = pm4py.extract_outcome_enriched_dataframe(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    conversion_params = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)
    dataframe = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=conversion_params)

    from pm4py.util import pandas_utils

    # per-case features, including the case identifier needed for the final merge
    case_features = extract_features_dataframe(dataframe, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True)

    # both enrichment steps receive the same column configuration and work on copies
    column_args = {
        "timestamp_column": timestamp_key,
        "case_id_column": case_id_key,
        "start_timestamp_column": start_timestamp_key,
    }
    enriched = pandas_utils.insert_case_arrival_finish_rate(dataframe.copy(), **column_args)
    enriched = pandas_utils.insert_case_service_waiting_time(enriched.copy(), **column_args)

    return enriched.merge(case_features, left_on=case_id_key, right_on=case_id_key)
def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame:
    """
    Extracts a dataframe containing the features of each case of the provided log object

    :param log: log object (event log / Pandas dataframe)
    :param str_tr_attr: (if provided) string attributes at the case level which should be extracted as features
    :param num_tr_attr: (if provided) numeric attributes at the case level which should be extracted as features
    :param str_ev_attr: (if provided) string attributes at the event level which should be extracted as features (one-hot encoding)
    :param num_ev_attr: (if provided) numeric attributes at the event level which should be extracted as features (last value per attribute in a case)
    :param str_evsucc_attr: (if provided) forwarded unchanged as the ``str_evsucc_attr`` parameter of the feature extraction (presumably string event attributes whose succession of values is encoded - to be confirmed against ``log_to_features``)
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param resource_key: the attribute to be used as resource
    :param include_case_id: includes the case identifier column in the features table
    :param kwargs: additional parameters forwarded to the feature extraction; entries with the same key as the properties / attributes set by this function are overridden
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        features_df = pm4py.extract_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    # start from the caller-provided extra parameters (always a fresh dict per call)
    parameters = dict(kwargs)

    # ``resource_key`` is forwarded as well, so that a custom resource column is
    # honoured instead of being silently replaced by the default of ``get_properties``
    parameters.update(get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, resource_key=resource_key))

    parameters.update({
        "str_tr_attr": str_tr_attr,
        "num_tr_attr": num_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "add_case_identifier_column": include_case_id,
    })

    from pm4py.algo.transformation.log_to_features import algorithm as log_to_features

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)

    data, feature_names = log_to_features.apply(log, parameters=parameters)

    return pandas_utils.instantiate_dataframe(data, columns=feature_names)
def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_paths: bool = True, enable_object_work_in_progress: bool = False, object_str_attributes: Optional[Collection[str]] = None, object_num_attributes: Optional[Collection[str]] = None, include_obj_id: bool = False, debug: bool = False) -> pd.DataFrame:
    """
    Extracts from an object-centric event log a set of features (returned as dataframe) computed on the OCEL
    for the objects of a given object type.

    Implements the approach described in:
    Berti, A., Herforth, J., Qafari, M.S. et al. Graph-based feature extraction on object-centric event logs. Int J Data Sci Anal (2023). https://doi.org/10.1007/s41060-023-00428-2

    The columns of the extracted table containing any missing value are dropped, and only the numeric
    columns are kept. When requested, the object identifier column is appended afterwards.

    :param ocel: object-centric event log
    :param obj_type: object type that should be considered
    :param enable_object_lifecycle_paths: enables the "lifecycle paths" feature
    :param enable_object_work_in_progress: enables the "work in progress" feature (which has a high computational cost)
    :param object_str_attributes: string attributes at the object level to one-hot encode during the feature extraction
    :param object_num_attributes: numeric attributes at the object level to consider during the feature extraction
    :param include_obj_id: includes the object identifier as column of the "features" dataframe
    :param debug: enables debugging mode (telling at which point of the feature extraction you are)
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')
        fea_df = pm4py.extract_ocel_features(ocel, "item")
    """
    str_attrs = [] if object_str_attributes is None else object_str_attributes
    num_attrs = [] if object_num_attributes is None else object_num_attributes

    # attribute-based features are switched on only when attributes are actually requested
    parameters = {
        "filter_per_type": obj_type,
        "enable_object_lifecycle_paths": enable_object_lifecycle_paths,
        "enable_object_work_in_progress": enable_object_work_in_progress,
        "enable_object_str_attributes": len(str_attrs) > 0,
        "enable_object_num_attributes": len(num_attrs) > 0,
        "str_obj_attr": str_attrs,
        "num_obj_attr": num_attrs,
        "debug": debug,
    }

    from pm4py.algo.transformation.ocel.features.objects import algorithm as ocel_feature_extraction

    data, feature_names = ocel_feature_extraction.apply(ocel, parameters=parameters)

    features = pandas_utils.instantiate_dataframe(data, columns=feature_names)
    features = features.dropna(how="any", axis=1).select_dtypes(include=np.number)

    if include_obj_id:
        id_column = ocel.object_id_column
        type_column = ocel.object_type_column
        object_records = ocel.objects[[id_column, type_column]].to_dict("records")
        # NOTE(review): assumes the feature rows follow the order of the objects of this type in ``ocel.objects``
        features[id_column] = [record[id_column] for record in object_records if record[type_column] == obj_type]

    return features
def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame:
    """
    Extracts a dataframe containing the temporal features of the provided log object

    Implements the approach described in the paper:
    Pourbafrani, Mahsa, Sebastiaan J. van Zelst, and Wil MP van der Aalst. "Supporting automatic system dynamics model generation for simulation in the context of process mining." International Conference on Business Information Systems. Springer, Cham, 2020.

    :param log: log object (event log / Pandas dataframe)
    :param grouper_freq: the grouping frequency (D, W, M, Y) to use
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param resource_key: the attribute to be used as resource
    :param start_timestamp_key: the attribute to be used as start timestamp
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        temporal_features_df = pm4py.extract_temporal_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)

    from pm4py.algo.transformation.log_to_features.variants import temporal

    options = temporal.Parameters
    overrides = [
        (options.GROUPER_FREQ, grouper_freq),
        (options.ACTIVITY_COLUMN, activity_key),
        (options.TIMESTAMP_COLUMN, timestamp_key),
    ]
    if case_id_key is not None:
        # without an explicit case identifier the variant keeps its own default
        overrides.append((options.CASE_ID_COLUMN, case_id_key))
    overrides.append((options.START_TIMESTAMP_COLUMN, start_timestamp_key))
    overrides.append((options.RESOURCE_COLUMN, resource_key))

    for option, value in overrides:
        parameters[option] = value

    return temporal.apply(log, parameters=parameters)
def extract_target_vector(log: Union[EventLog, pd.DataFrame], variant: str, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name") -> Tuple[Any, List[str]]:
    """
    Extracts from a log object the target vector for a specific ML use case
    (next activity, next time, remaining time)

    Returns the pair produced by ``log_to_target.apply`` for the selected variant.
    An ``Exception`` is raised when ``variant`` is not one of the supported names.

    :param log: log object (event log / Pandas dataframe)
    :param variant: variant of the algorithm to be used: next_activity, next_time, remaining_time
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: the attribute to be used as case identifier
    :rtype: ``Tuple[Any, List[str]]``

    .. code-block:: python3

        import pm4py

        vector_next_act, class_next_act = pm4py.extract_target_vector(log, 'next_activity', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_next_time, class_next_time = pm4py.extract_target_vector(log, 'next_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_rem_time, class_rem_time = pm4py.extract_target_vector(log, 'remaining_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)

    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target

    # public variant names mapped to the variants of the underlying algorithm
    supported_variants = {
        "next_activity": log_to_target.Variants.NEXT_ACTIVITY,
        "next_time": log_to_target.Variants.NEXT_TIME,
        "remaining_time": log_to_target.Variants.REMAINING_TIME,
    }

    selected_variant = supported_variants.get(variant)
    if selected_variant is None:
        raise Exception(
            "please provide the variant between: next_activity, next_time, remaining_time")

    vector, class_names = log_to_target.apply(log, variant=selected_variant, parameters=parameters)
    return vector, class_names