kota
initial commit
e60e568
raw
history blame
3.53 kB
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.transformation.log_to_features import algorithm as features_extractor
from enum import Enum
from pm4py.util import exec_utils
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
import numpy as np
from typing import Optional, Dict, Any, Generator, Union
from copy import copy
class Parameters(Enum):
SKLEARN_CLUSTERER = "sklearn_clusterer"
def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Generator[EventLog, None, None]:
"""
Cluster the event log, based on the extraction of profiles for the traces of the event log
(by means of the feature extraction proposed in pm4py) and the application of a Scikit learn clusterer
(default: K-means with two clusters)
Implements the approach described in:
Song, Minseok, Christian W. Günther, and Wil MP Van der Aalst. "Trace clustering in process mining." Business Process Management Workshops: BPM 2008 International Workshops, Milano, Italy, September 1-4, 2008. Revised Papers 6. Springer Berlin Heidelberg, 2009.
Parameters
----------------
log
Event log
parameters
Parameters of the feature extraction, including:
- Parameters.SKLEARN_CLUSTERER => the Scikit-Learn clusterer to be used (default: KMeans(n_clusters=2, random_state=0, n_init="auto"))
Returns
---------------
generator
Generator of logs (clusters)
"""
if parameters is None:
parameters = {}
from pm4py.util import ml_utils
clusterer = exec_utils.get_param_value(Parameters.SKLEARN_CLUSTERER, parameters, ml_utils.KMeans(n_clusters=2, random_state=0, n_init="auto"))
if "enable_activity_def_representation" not in parameters:
parameters["enable_activity_def_representation"] = True
if "enable_succ_def_representation" not in parameters:
parameters["enable_succ_def_representation"] = True
conv_parameters = copy(parameters)
conv_parameters["stream_postprocessing"] = True
log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=conv_parameters)
data, feature_names = features_extractor.apply(log, parameters=parameters)
data = np.array([np.array(x) for x in data])
clusters = clusterer.fit_predict(data)
max_clu = max(clusters)
clust_idxs = {i: list() for i in range(max_clu+1)}
for i in range(len(clusters)):
clust_idxs[clusters[i]].append(i)
for i in clust_idxs:
clust_log = EventLog()
for j in clust_idxs[i]:
clust_log.append(log[j])
#clust_log = log_converter.apply(clust_log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=parameters)
yield clust_log