Spaces:
Running
Running
'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
# Module-level docstring (assigned explicitly so it survives the license header).
__doc__ = """
The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py``
"""
from typing import Union, Tuple, Any, List, Collection, Optional | |
import pandas as pd | |
import numpy as np | |
from pm4py.objects.ocel.obj import OCEL | |
from pm4py.objects.log.obj import EventLog, EventStream | |
from pm4py.objects.conversion.log import converter as log_converter | |
from pm4py.utils import __event_log_deprecation_warning | |
import random | |
from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns | |
from pm4py.utils import get_properties, constants, pandas_utils | |
def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float = 0.8, case_id_key="case:concept:name") -> Union[
        Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Splits an event log into a training log and a test log (for machine learning purposes).

    Each case is assigned independently at random: with probability
    ``train_percentage`` the case goes into the training partition, otherwise
    into the test partition. Returns the training and the test event log.

    :param log: event log / Pandas dataframe
    :param train_percentage: fraction of traces to be included in the training log (from 0.0 to 1.0)
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]``

    .. code-block:: python3

        import pm4py

        train_df, test_df = pm4py.split_train_test(dataframe, train_percentage=0.75)
    """
    __event_log_deprecation_warning(log)

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        all_cases = pandas_utils.format_unique(log[case_id_key].unique())
        # one random draw per case decides its partition
        training_cases = {case for case in all_cases if random.random() <= train_percentage}
        in_training = log[case_id_key].isin(training_cases)
        # the complement of the training mask is exactly the test partition
        return log[in_training], log[~in_training]
    else:
        from pm4py.objects.log.util import split_train_test
        return split_train_test.split(log, train_percentage=train_percentage)
def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]:
    """
    Gets the prefixes of the traces of a log, up to a given length.

    The returned log object contains the prefixes:
    - a trace with length lower than or equal to ``length`` is included as-is;
    - a longer trace is cut to its first ``length`` events.

    :param log: event log / Pandas dataframe
    :param length: maximum prefix length
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[EventLog, pd.DataFrame]``

    .. code-block:: python3

        import pm4py

        trimmed_df = pm4py.get_prefixes_from_log(dataframe, length=5, case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    if not check_is_pandas_dataframe(log):
        from pm4py.objects.log.util import get_prefixes
        return get_prefixes.get_prefixes_from_log(log, length)

    check_pandas_dataframe_columns(log, case_id_key=case_id_key)
    from pm4py.util import pandas_utils
    # attach the 0-based position of every event inside its trace
    indexed = pandas_utils.insert_ev_in_tr_index(log, case_id=case_id_key)
    # keep only events whose position falls inside the requested prefix
    return indexed[indexed[constants.DEFAULT_INDEX_IN_TRACE_KEY] < length]
def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name",
                                       timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name",
                                       start_timestamp_key: str = "time:timestamp") -> pd.DataFrame:
    """
    Inserts additional columns in the dataframe which are computed on the overall case,
    so they model the outcome of the case.

    :param log: event log / Pandas dataframe
    :param activity_key: attribute to be used for the activity
    :param timestamp_key: attribute to be used for the timestamp
    :param case_id_key: attribute to be used as case identifier
    :param start_timestamp_key: attribute to be used as start timestamp
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        enriched_df = pm4py.extract_outcome_enriched_dataframe(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    conversion_params = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=conversion_params)

    from pm4py.util import pandas_utils

    # one row per case, with the case identifier retained so we can merge below
    fea_df = extract_features_dataframe(df, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True)

    enriched = pandas_utils.insert_case_arrival_finish_rate(df.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)
    enriched = pandas_utils.insert_case_service_waiting_time(enriched.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)

    # join the per-case features back onto the event-level dataframe
    return enriched.merge(fea_df, left_on=case_id_key, right_on=case_id_key)
def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame:
    """
    Extracts a dataframe containing the features of each case of the provided log object.

    :param log: log object (event log / Pandas dataframe)
    :param str_tr_attr: (if provided) string attributes at the case level which should be extracted as features
    :param num_tr_attr: (if provided) numeric attributes at the case level which should be extracted as features
    :param str_ev_attr: (if provided) string attributes at the event level which should be extracted as features (one-hot encoding)
    :param num_ev_attr: (if provided) numeric attributes at the event level which should be extracted as features (last value per attribute in a case)
    :param str_evsucc_attr: (if provided) string attributes at the event level whose succession (directly-follows pairs) should be encoded as features
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param resource_key: the attribute to be used as resource (accepted for interface uniformity; not forwarded to the extraction here)
    :param include_case_id: includes the case identifier column in the features table
    :param kwargs: additional parameters forwarded as-is to the feature extraction algorithm
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        features_df = pm4py.extract_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    # ``**kwargs`` is always a dict (possibly empty), so no None check is needed;
    # copy it and overlay the standard pm4py properties on top.
    parameters = dict(kwargs)
    parameters.update(get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key))
    parameters["str_tr_attr"] = str_tr_attr
    parameters["num_tr_attr"] = num_tr_attr
    parameters["str_ev_attr"] = str_ev_attr
    parameters["num_ev_attr"] = num_ev_attr
    parameters["str_evsucc_attr"] = str_evsucc_attr
    parameters["add_case_identifier_column"] = include_case_id

    from pm4py.algo.transformation.log_to_features import algorithm as log_to_features

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key)

    data, feature_names = log_to_features.apply(log, parameters=parameters)

    return pandas_utils.instantiate_dataframe(data, columns=feature_names)
def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_paths: bool = True, enable_object_work_in_progress: bool = False, object_str_attributes: Optional[Collection[str]] = None, object_num_attributes: Optional[Collection[str]] = None, include_obj_id: bool = False, debug: bool = False) -> pd.DataFrame:
    """
    Extracts from an object-centric event log a set of features (returned as dataframe)
    computed on the OCEL for the objects of a given object type.

    Implements the approach described in:
    Berti, A., Herforth, J., Qafari, M.S. et al. Graph-based feature extraction on object-centric event logs. Int J Data Sci Anal (2023). https://doi.org/10.1007/s41060-023-00428-2

    :param ocel: object-centric event log
    :param obj_type: object type that should be considered
    :param enable_object_lifecycle_paths: enables the "lifecycle paths" feature
    :param enable_object_work_in_progress: enables the "work in progress" feature (which has an high computational cost)
    :param object_str_attributes: string attributes at the object level to one-hot encode during the feature extraction
    :param object_num_attributes: numeric attributes at the object level to one-hot encode during the feature extraction
    :param include_obj_id: includes the object identifier as column of the "features" dataframe
    :param debug: enables debugging mode (telling at which point of the feature extraction you are)
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')
        fea_df = pm4py.extract_ocel_features(ocel, "item")
    """
    str_attrs = object_str_attributes if object_str_attributes is not None else []
    num_attrs = object_num_attributes if object_num_attributes is not None else []

    parameters = {
        "filter_per_type": obj_type,
        "enable_object_lifecycle_paths": enable_object_lifecycle_paths,
        "enable_object_work_in_progress": enable_object_work_in_progress,
        # attribute encodings are enabled only when at least one attribute is given
        "enable_object_str_attributes": len(str_attrs) > 0,
        "enable_object_num_attributes": len(num_attrs) > 0,
        "str_obj_attr": str_attrs,
        "num_obj_attr": num_attrs,
        "debug": debug,
    }

    from pm4py.algo.transformation.ocel.features.objects import algorithm as ocel_feature_extraction

    data, feature_names = ocel_feature_extraction.apply(ocel, parameters=parameters)

    dataframe = pandas_utils.instantiate_dataframe(data, columns=feature_names)
    # discard columns with missing values, then keep only the numeric ones
    dataframe = dataframe.dropna(how="any", axis=1)
    dataframe = dataframe.select_dtypes(include=np.number)

    if include_obj_id:
        rows = ocel.objects[[ocel.object_id_column, ocel.object_type_column]].to_dict("records")
        dataframe[ocel.object_id_column] = [r[ocel.object_id_column] for r in rows if r[ocel.object_type_column] == obj_type]

    return dataframe
def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame:
    """
    Extracts a dataframe containing the temporal features of the provided log object.

    Implements the approach described in the paper:
    Pourbafrani, Mahsa, Sebastiaan J. van Zelst, and Wil MP van der Aalst. "Supporting automatic system dynamics model generation for simulation in the context of process mining." International Conference on Business Information Systems. Springer, Cham, 2020.

    :param log: log object (event log / Pandas dataframe)
    :param grouper_freq: the grouping frequency (D, W, M, Y) to use
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier
    :param start_timestamp_key: the attribute to be used as start timestamp
    :param resource_key: the attribute to be used as resource
    :rtype: ``pd.DataFrame``

    .. code-block:: python3

        import pm4py

        temporal_features_df = pm4py.extract_temporal_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp')
    """
    __event_log_deprecation_warning(log)

    from pm4py.algo.transformation.log_to_features.variants import temporal

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)
    parameters[temporal.Parameters.GROUPER_FREQ] = grouper_freq
    parameters[temporal.Parameters.ACTIVITY_COLUMN] = activity_key
    parameters[temporal.Parameters.TIMESTAMP_COLUMN] = timestamp_key
    parameters[temporal.Parameters.START_TIMESTAMP_COLUMN] = start_timestamp_key
    parameters[temporal.Parameters.RESOURCE_COLUMN] = resource_key
    # the case id column is only forwarded when explicitly provided
    if case_id_key is not None:
        parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key

    return temporal.apply(log, parameters=parameters)
def extract_target_vector(log: Union[EventLog, pd.DataFrame], variant: str, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name") -> Tuple[Any, List[str]]:
    """
    Extracts from a log object the target vector for a specific ML use case
    (next activity, next time, remaining time).

    :param log: log object (event log / Pandas dataframe)
    :param variant: variant of the algorithm to be used: next_activity, next_time, remaining_time
    :param activity_key: the attribute to be used as activity
    :param timestamp_key: the attribute to be used as timestamp
    :param case_id_key: the attribute to be used as case identifier
    :raises ValueError: if ``variant`` is not one of the supported values
    :rtype: ``Tuple[Any, List[str]]``

    .. code-block:: python3

        import pm4py

        vector_next_act, class_next_act = pm4py.extract_target_vector(log, 'next_activity', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_next_time, class_next_time = pm4py.extract_target_vector(log, 'next_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
        vector_rem_time, class_rem_time = pm4py.extract_target_vector(log, 'remaining_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
    """
    __event_log_deprecation_warning(log)

    parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)

    from pm4py.algo.transformation.log_to_target import algorithm as log_to_target

    var_map = {"next_activity": log_to_target.Variants.NEXT_ACTIVITY,
               "next_time": log_to_target.Variants.NEXT_TIME,
               "remaining_time": log_to_target.Variants.REMAINING_TIME}

    if variant not in var_map:
        # ValueError is more precise than a generic Exception for a bad argument
        # and stays backward-compatible (callers catching Exception still match).
        raise ValueError(
            "please provide the variant between: next_activity, next_time, remaining_time")

    target, classes = log_to_target.apply(log, variant=var_map[variant], parameters=parameters)
    return target, classes