Spaces:
Running
Running
'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from pm4py.objects.conversion.log import converter as log_converter | |
from pm4py.algo.transformation.log_to_features import algorithm as features_extractor | |
from enum import Enum | |
from pm4py.util import exec_utils | |
from pm4py.objects.log.obj import EventLog, EventStream | |
import pandas as pd | |
import numpy as np | |
from typing import Optional, Dict, Any, Generator, Union | |
from copy import copy | |
class Parameters(Enum):
    """Keys recognised in the ``parameters`` dictionary of this module."""

    # Clusterer object applied to the extracted trace features.
    SKLEARN_CLUSTERER = "sklearn_clusterer"
def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> Generator[EventLog, None, None]:
    """
    Cluster the event log, based on the extraction of profiles for the traces of the event log
    (by means of the feature extraction proposed in pm4py) and the application of a Scikit learn clusterer
    (default: K-means with two clusters)

    Implements the approach described in:
    Song, Minseok, Christian W. Günther, and Wil MP Van der Aalst. "Trace clustering in process mining." Business Process Management Workshops: BPM 2008 International Workshops, Milano, Italy, September 1-4, 2008. Revised Papers 6. Springer Berlin Heidelberg, 2009.

    Parameters
    ----------------
    log
        Event log
    parameters
        Parameters of the feature extraction, including:
        - Parameters.SKLEARN_CLUSTERER => the Scikit-Learn clusterer to be used (default: KMeans(n_clusters=2, random_state=0, n_init="auto"))
        The dictionary provided by the caller is not modified.

    Returns
    ---------------
    generator
        Generator of logs (clusters). One log is yielded for every label in 0..max(label), in increasing
        order (possibly empty if no trace received that label). Traces whose label falls outside that
        range (e.g., the -1 "noise" label assigned by density-based clusterers) are grouped per label
        and yielded afterwards.
    """
    # work on a shallow copy, so the defaults injected below never leak into the caller's dictionary
    parameters = {} if parameters is None else copy(parameters)

    from pm4py.util import ml_utils

    clusterer = exec_utils.get_param_value(Parameters.SKLEARN_CLUSTERER, parameters, ml_utils.KMeans(n_clusters=2, random_state=0, n_init="auto"))

    # unless the caller decided otherwise, enable these two feature representations
    parameters.setdefault("enable_activity_def_representation", True)
    parameters.setdefault("enable_succ_def_representation", True)

    # the conversion gets its own copy, since it needs an extra option that is not passed to the feature extraction
    conv_parameters = copy(parameters)
    conv_parameters["stream_postprocessing"] = True
    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=conv_parameters)

    data, _ = features_extractor.apply(log, parameters=parameters)
    data = np.array([np.array(x) for x in data])

    clusters = clusterer.fit_predict(data)

    # clusters[i] is the label assigned to the i-th trace of the log
    clust_idxs = {i: [] for i in range(max(clusters) + 1)}
    for trace_idx, label in enumerate(clusters):
        # setdefault avoids a KeyError for labels outside 0..max (such as the negative noise label)
        clust_idxs.setdefault(label, []).append(trace_idx)

    for trace_idxs in clust_idxs.values():
        clust_log = EventLog()
        for j in trace_idxs:
            clust_log.append(log[j])
        yield clust_log