Spaces:

linpershey
/

process_mining

Running

process_mining / pm4py /algo /clustering /profiles /variants /sklearn_profiles.py

kota

initial commit

e60e568 11 months ago

3.53 kB

	'''
	This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

	PM4Py is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	PM4Py is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
	'''
	from pm4py.objects.conversion.log import converter as log_converter
	from pm4py.algo.transformation.log_to_features import algorithm as features_extractor
	from enum import Enum
	from pm4py.util import exec_utils
	from pm4py.objects.log.obj import EventLog, EventStream
	import pandas as pd
	import numpy as np
	from typing import Optional, Dict, Any, Generator, Union
	from copy import copy


	class Parameters(Enum):
	    """Keys recognised in the ``parameters`` dictionary of this module."""

	    # Key under which apply() looks up the clusterer object to use.
	    SKLEARN_CLUSTERER = "sklearn_clusterer"


	def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Generator[EventLog, None, None]:
	    """
	    Cluster the event log, based on the extraction of profiles for the traces of the event log
	    (by means of the feature extraction proposed in pm4py) and the application of a Scikit learn clusterer
	    (default: K-means with two clusters)

	    Implements the approach described in:
	    Song, Minseok, Christian W. Günther, and Wil MP Van der Aalst. "Trace clustering in process mining." Business Process Management Workshops: BPM 2008 International Workshops, Milano, Italy, September 1-4, 2008. Revised Papers 6. Springer Berlin Heidelberg, 2009.

	    Parameters
	    ----------------
	    log
	        Event log (an EventLog, an EventStream or a Pandas dataframe; it is converted to an EventLog)
	    parameters
	        Parameters of the feature extraction, including:
	        - Parameters.SKLEARN_CLUSTERER => the Scikit-Learn clusterer to be used (default: KMeans(n_clusters=2, random_state=0, n_init="auto"))
	        The dictionary is also forwarded to the log conversion and to the feature extraction.
	        It is never modified: the function works on a shallow copy.
	        If "enable_activity_def_representation" / "enable_succ_def_representation" are not
	        specified, they are set to True for the feature extraction.

	    Returns
	    ---------------
	    generator
	        Generator of logs (clusters). One log is yielded for every label 0..max(label), in
	        increasing order (a label without traces yields an empty log). Negative labels
	        (e.g. the "noise" label -1 used by some Scikit-Learn clusterers) are yielded
	        afterwards, in increasing order. Nothing is yielded for an empty log.
	    """
	    # Shallow copy: the defaults injected below must not leak into the caller's dictionary.
	    parameters = {} if parameters is None else copy(parameters)

	    clusterer = exec_utils.get_param_value(Parameters.SKLEARN_CLUSTERER, parameters, None)
	    if clusterer is None:
	        # Build the default clusterer only when the caller did not provide one.
	        from pm4py.util import ml_utils
	        clusterer = ml_utils.KMeans(n_clusters=2, random_state=0, n_init="auto")

	    parameters.setdefault("enable_activity_def_representation", True)
	    parameters.setdefault("enable_succ_def_representation", True)

	    conv_parameters = copy(parameters)
	    conv_parameters["stream_postprocessing"] = True

	    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=conv_parameters)
	    if len(log) == 0:
	        # No traces means no profiles to cluster.
	        return

	    data, _ = features_extractor.apply(log, parameters=parameters)
	    data = np.array([np.array(x) for x in data])

	    labels = clusterer.fit_predict(data)

	    # Group trace positions by label; unlike a dict pre-filled with range(max+1),
	    # this also accepts negative labels.
	    members = {}
	    for trace_idx, label in enumerate(labels):
	        members.setdefault(int(label), []).append(trace_idx)

	    # Non-negative labels keep their positional order (position i <-> label i);
	    # negative labels follow at the end so that their traces are not lost.
	    ordered_labels = list(range(max(members) + 1)) + sorted(lab for lab in members if lab < 0)

	    for label in ordered_labels:
	        clust_log = EventLog()
	        for trace_idx in members.get(label, ()):
	            clust_log.append(log[trace_idx])

	        yield clust_log