Spaces:
Running
Running
'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
# Module-level docstring (assigned explicitly so it survives the license header).
__doc__ = """
The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py``
"""
from typing import Union, Tuple, Any, List, Collection, Optional | |
import pandas as pd | |
import numpy as np | |
from pm4py.objects.ocel.obj import OCEL | |
from pm4py.objects.log.obj import EventLog, EventStream | |
from pm4py.objects.conversion.log import converter as log_converter | |
from pm4py.utils import __event_log_deprecation_warning | |
import random | |
from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns | |
from pm4py.utils import get_properties, constants, pandas_utils | |
def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float = 0.8, case_id_key="case:concept:name") -> Union[
        Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Splits an event log into a training log and a test log (for machine learning purposes).

    Each case is assigned independently at random: with probability
    ``train_percentage`` the case goes into the training partition, otherwise
    into the test partition. Returns the training and the test event log.

    :param log: event log / Pandas dataframe
    :param train_percentage: fraction of traces to be included in the training log (from 0.0 to 1.0)
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]``

    .. code-block:: python3

        import pm4py

        train_df, test_df = pm4py.split_train_test(dataframe, train_percentage=0.75)
    """
    __event_log_deprecation_warning(log)

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        all_cases = pandas_utils.format_unique(log[case_id_key].unique())
        # one random draw per case decides its partition
        training_cases = {case for case in all_cases if random.random() <= train_percentage}
        in_training = log[case_id_key].isin(training_cases)
        # the complement of the training mask is exactly the test partition
        return log[in_training], log[~in_training]
    else:
        from pm4py.objects.log.util import split_train_test
        return split_train_test.split(log, train_percentage=train_percentage)
def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]:
    """
    Gets the prefixes of the traces of a log, up to a given length.

    The returned log object contains the prefixes:
    - a trace with length lower than or equal to ``length`` is included as-is;
    - a longer trace is cut to its first ``length`` events.

    :param log: event log / Pandas dataframe
    :param length: maximum prefix length
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[EventLog, pd.DataFrame]``

    .. code-block:: python3

        import pm4py

        trimmed_df = pm4py.get_prefixes_from_log(dataframe, length=5, case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    if not check_is_pandas_dataframe(log):
        from pm4py.objects.log.util import get_prefixes
        return get_prefixes.get_prefixes_from_log(log, length)

    check_pandas_dataframe_columns(log, case_id_key=case_id_key)
    from pm4py.util import pandas_utils
    # attach the 0-based position of every event inside its trace
    indexed = pandas_utils.insert_ev_in_tr_index(log, case_id=case_id_key)
    # keep only events whose position falls inside the requested prefix
    return indexed[indexed[constants.DEFAULT_INDEX_IN_TRACE_KEY] < length]
def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name",
                                       timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name",
                                       start_timestamp_key: str = "time:timestamp") -> pd.DataFrame:
    """
    Inserts additional columns in the dataframe which are computed on the overall case,
    so they model the outcome of the case.

    :param log: event log / Pandas dataframe
    :param activity_key: attribute to be used for the activity
    :param timestamp_key: attribute to be used for the timestamp
    :param case_id_key: attribute to be used as case identifier
    :param start_timestamp_key: attribute to be used as start timestamp
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        enriched_df = pm4py.extract_outcome_enriched_dataframe(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    conversion_params = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=conversion_params)

    from pm4py.util import pandas_utils

    # one row per case, with the case identifier retained so we can merge below
    fea_df = extract_features_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True)

    enriched = pandas_utils.insert_case_arrival_finish_rate(df.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)
    enriched = pandas_utils.insert_case_service_waiting_time(enriched.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)

    # join the per-case features back onto the event-level dataframe
    return enriched.merge(fea_df, left_on=case_id_key, right_on=case_id_key)
def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame:
    """
    Extracts a dataframe containing the features of each case of the provided log object.

    :param log: log object (event log / Pandas dataframe)
    :param str_tr_attr: (if provided) string attributes at the case level which should be extracted as features
    :param num_tr_attr: (if provided) numeric attributes at the case level which should be extracted as features
    :param str_ev_attr: (if provided) string attributes at the event level which should be extracted as features (one-hot encoding)
    :param num_ev_attr: (if provided) numeric attributes at the event level which should be extracted as features (last value per attribute in a case)
    :param str_evsucc_attr: (if provided) string attributes at the event level whose succession (directly-follows pairs) should be encoded as features
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param resource_key: the attribute to be used as resource (accepted for interface uniformity; not forwarded to the extraction here)
    :param include_case_id: includes the case identifier column in the features table
    :param kwargs: additional parameters forwarded as-is to the feature extraction algorithm
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        features_df = pm4py.extract_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    # ``**kwargs`` is always a dict (possibly empty), so no None check is needed;
    # copy it and overlay the standard pm4py properties on top.
    parameters = dict(kwargs)
    parameters.update(get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key))
    parameters["str_tr_attr"] = str_tr_attr
    parameters["num_tr_attr"] = num_tr_attr
    parameters["str_ev_attr"] = str_ev_attr
    parameters["num_ev_attr"] = num_ev_attr
    parameters["str_evsucc_attr"] = str_evsucc_attr
    parameters["add_case_identifier_column"] = include_case_id

    from pm4py.algo.transformation.log_to_features import algorithm as log_to_features

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)

    data, feature_names = log_to_features.apply(log, parameters=parameters)

    return pandas_utils.instantiate_dataframe(data, columns=feature_names)
def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_paths: bool = True, enable_object_work_in_progress: bool = False, object_str_attributes: Optional[Collection[str]] = None, object_num_attributes: Optional[Collection[str]] = None, include_obj_id: bool = False, debug: bool = False) -> pd.DataFrame:
    """
    Extracts from an object-centric event log a set of features (returned as dataframe)
    computed on the OCEL for the objects of a given object type.

    Implements the approach described in:
    Berti, A., Herforth, J., Qafari, M.S. et al. Graph-based feature extraction on object-centric event logs. Int J Data Sci Anal (2023). https://doi.org/10.1007/s41060-023-00428-2

    :param ocel: object-centric event log
    :param obj_type: object type that should be considered
    :param enable_object_lifecycle_paths: enables the "lifecycle paths" feature
    :param enable_object_work_in_progress: enables the "work in progress" feature (which has an high computational cost)
    :param object_str_attributes: string attributes at the object level to one-hot encode during the feature extraction
    :param object_num_attributes: numeric attributes at the object level to one-hot encode during the feature extraction
    :param include_obj_id: includes the object identifier as column of the "features" dataframe
    :param debug: enables debugging mode (telling at which point of the feature extraction you are)
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')
        fea_df = pm4py.extract_ocel_features(ocel, "item")
    """
    str_attrs = object_str_attributes if object_str_attributes is not None else []
    num_attrs = object_num_attributes if object_num_attributes is not None else []

    parameters = {
        "filter_per_type": obj_type,
        "enable_object_lifecycle_paths": enable_object_lifecycle_paths,
        "enable_object_work_in_progress": enable_object_work_in_progress,
        # attribute encodings are enabled only when at least one attribute is given
        "enable_object_str_attributes": len(str_attrs) > 0,
        "enable_object_num_attributes": len(num_attrs) > 0,
        "str_obj_attr": str_attrs,
        "num_obj_attr": num_attrs,
        "debug": debug,
    }

    from pm4py.algo.transformation.ocel.features.objects import algorithm as ocel_feature_extraction

    data, feature_names = ocel_feature_extraction.apply(ocel, parameters=parameters)

    dataframe = pandas_utils.instantiate_dataframe(data, columns=feature_names)
    # discard columns with missing values, then keep only the numeric ones
    dataframe = dataframe.dropna(how="any", axis=1)
    dataframe = dataframe.select_dtypes(include=np.number)

    if include_obj_id:
        rows = ocel.objects[[ocel.object_id_column, ocel.object_type_column]].to_dict("records")
        dataframe[ocel.object_id_column] = [r[ocel.object_id_column] for r in rows if r[ocel.object_type_column] == obj_type]

    return dataframe
def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame:
    """
    Extracts a dataframe containing the temporal features of the provided log object.

    Implements the approach described in the paper:
    Pourbafrani, Mahsa, Sebastiaan J. van Zelst, and Wil MP van der Aalst. "Supporting automatic system dynamics model generation for simulation in the context of process mining." International Conference on Business Information Systems. Springer, Cham, 2020.

    :param log: log object (event log / Pandas dataframe)
    :param grouper_freq: the grouping frequency (D, W, M, Y) to use
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param start_timestamp_key: the attribute to be used as start timestamp
    :param resource_key: the attribute to be used as resource
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        temporal_features_df = pm4py.extract_temporal_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    from pm4py.algo.transformation.log_to_features.variants import temporal

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)
    parameters[temporal.Parameters.GROUPER_FREQ] = grouper_freq
    parameters[temporal.Parameters.ACTIVITY_COLUMN] = activity_key
    parameters[temporal.Parameters.TIMESTAMP_COLUMN] = timestamp_key
    parameters[temporal.Parameters.START_TIMESTAMP_COLUMN] = start_timestamp_key
    parameters[temporal.Parameters.RESOURCE_COLUMN] = resource_key
    # the case id column is only forwarded when explicitly provided
    if case_id_key is not None:
        parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key

    return temporal.apply(log, parameters=parameters)
def extract_target_vector(log: Union[EventLog, pd.DataFrame], variant: str, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name") -> Tuple[Any, List[str]]:
    """
    Extracts from a log object the target vector for a specific ML use case
    (next activity, next time, remaining time).

    :param log: log object (event log / Pandas dataframe)
    :param variant: variant of the algorithm to be used: next_activity, next_time, remaining_time
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: the attribute to be used as case identifier
    :raises ValueError: if ``variant`` is not one of the supported values
    :rtype: ``Tuple[Any, List[str]]``

    .. code-block:: python3

        import pm4py

        vector_next_act, class_next_act = pm4py.extract_target_vector(log, 'next_activity', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_next_time, class_next_time = pm4py.extract_target_vector(log, 'next_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_rem_time, class_rem_time = pm4py.extract_target_vector(log, 'remaining_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)

    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target

    var_map = {"next_activity": log_to_target.Variants.NEXT_ACTIVITY,
               "next_time": log_to_target.Variants.NEXT_TIME,
               "remaining_time": log_to_target.Variants.REMAINING_TIME}

    if variant not in var_map:
        # ValueError is more precise than a generic Exception for a bad argument
        # and stays backward-compatible (callers catching Exception still match).
        raise ValueError(
            "please provide the variant between: next_activity, next_time, remaining_time")

    target, classes = log_to_target.apply(log, variant=var_map[variant], parameters=parameters)
    return target, classes