import matplotlib as mpl
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import os  # NOTE: was imported twice in the original; de-duplicated
import pandas as pd
import seaborn as sns
import glob
from collections import defaultdict
from gedi.generator import get_tasks
from gedi.utils.io_helpers import get_keys_abbreviation
from gedi.utils.io_helpers import read_csvs, select_instance
from gedi.utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
from gedi.utils.param_keys import OUTPUT_PATH, PIPELINE_STEP
from gedi.utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
from gedi.utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.lines import Line2D
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.decomposition import PCA


def insert_newlines(string, every=140):
    """Break `string` into chunks of `every` characters joined by newlines."""
    return '\n'.join(string[i:i + every] for i in range(0, len(string), every))


class MyPlotter:
    """Base plotter bundling a matplotlib Figure/Axes pair with shared options
    (interactive backend, title prefix, paper-sized fonts)."""

    def __init__(self, interactive: bool = True, title_prefix: str = '', for_paper: bool = False):
        self.fig: Figure = Figure()
        # NOTE(review): positional rect for Axes(...) is deprecated in recent
        # matplotlib; kept for behavioral compatibility — the axes are replaced
        # by plt.subplots(...) before any real plotting anyway.
        self.axes: Axes = Axes(self.fig, [0, 0, 0, 0])
        self.interactive: bool = interactive
        self.title_prefix: str = title_prefix
        self.colors: dict = mcolors.TABLEAU_COLORS
        self.for_paper: bool = for_paper

        if self.interactive:
            mpl.use('TkAgg')

        # Larger fonts for camera-ready (paper) figures.
        if self.for_paper:
            self.fontsize = 18
        else:
            self.fontsize = 10

    def _set_figure_title(self):
        self.fig.suptitle(self.title_prefix)

    def _post_processing(self):
        # Paper figures carry their caption in the paper, not in the figure.
        if not self.for_paper:
            self._set_figure_title()
        plt.show()


class ModelResultPlotter(MyPlotter):
    def plot_models(self, model_results, plot_type='', plot_tics=False, components=None):
        """
        Plots the model results in 2d-coordinate system next to each other.
        Alternatively with tics of the components can be plotted under the
        figures when `plot_tics` is True.
        :param model_results: list of dictionary
            dict should contain the keys: 'model', 'projection', 'title_prefix' (optional)
        :param plot_type: param_key.plot_type (currently unused, kept for interface stability)
        :param plot_tics: bool (default: False)
            Plots the component tics under the base figures if True
        :param components: int
            Number of components used for the reduced projection
        """
        if plot_tics:
            # subplots(rows, columns): one scatter row plus one row per component
            self.fig, self.axes = plt.subplots(components + 1, len(model_results),
                                               constrained_layout=True, figsize=(10, 8))
            main_axes = self.axes[0]  # axes[row][column]
            if len(model_results) == 1:
                for component_nr in range(1, components + 1):
                    # BUGFIX: the original indexed with `DUMMY_ZERO`, which is
                    # undefined in this module (NameError); index 0 is meant.
                    self._plot_time_tics(self.axes[component_nr],
                                         model_results[0][PROJECTION],
                                         component=component_nr)
            else:
                for i, result in enumerate(model_results):
                    df_pca = pd.DataFrame(result[PROJECTION], columns=["PC1", "PC2"])
                    sns.scatterplot(ax=self.axes[0][i], data=df_pca, x="PC1", y="PC2",
                                    palette="bright", hue=[''] * len(df_pca), alpha=0.9, s=100)
                    try:
                        self.axes[0][i].set_xlabel(
                            f"PC1 ({np.round(result[EXPLAINED_VAR][0] * 100, 2)}% explained variance)")
                        self.axes[0][i].set_ylabel(
                            f"PC2 ({np.round(result[EXPLAINED_VAR][1] * 100, 2)}% explained variance)")
                    except TypeError:
                        # No explained-variance ratios available (e.g. t-SNE projection).
                        self.axes[0][i].set_xlabel("TSNE_1")
                        self.axes[0][i].set_ylabel("TSNE_2")
                    for component_nr in range(1, components + 1):
                        self._plot_time_tics(self.axes[component_nr][i], result[PROJECTION],
                                             component=component_nr)
        else:
            self.fig, self.axes = plt.subplots(1, len(model_results), constrained_layout=True)
            main_axes = self.axes
        plt.show()

    @staticmethod
    def _plot_time_tics(ax, projection, component):
        """
        Plot the time tics on a specific axis.
        :param ax: axis to draw on (cleared first)
        :param projection: ndarray; column `component - 1` is drawn over time
        :param component: 1-based component index
        """
        ax.cla()
        ax.set_xlabel('Time step')
        ax.set_ylabel('Component {}'.format(component))
        ax.label_outer()
        ax.plot(projection[:, component - 1])
class ArrayPlotter(MyPlotter):
    """Plotter for array data: matrix heat maps / 3d surfaces, gauss fits and
    (merged) line plots, with optional bottom text, y-range and grid."""

    def __init__(self, interactive=False, title_prefix='', x_label='', y_label='', bottom_text=None,
                 y_range=None, show_grid=False, xtick_start=0, for_paper=False):
        super().__init__(interactive, title_prefix, for_paper)
        self.x_label = x_label
        self.y_label = y_label
        self.bottom_text = bottom_text      # optional caption below the plot
        self.range_tuple = y_range          # (ymin, ymax) or None
        self._activate_legend = False       # toggled by the plot_* methods
        self.show_grid = show_grid
        self.xtick_start = xtick_start      # offset for x tick labels in matrix_plot

    def _post_processing(self, legend_outside=False):
        """Apply labels, legend, y-range and grid, then delegate to the base class."""
        # self.axes.set_title(self.title_prefix)
        self.axes.set_xlabel(self.x_label, fontsize=self.fontsize)
        self.axes.set_ylabel(self.y_label, fontsize=self.fontsize)
        # plt.xticks(fontsize=self.fontsize)
        # plt.yticks(fontsize=self.fontsize)
        if self.bottom_text is not None:
            self.fig.text(0.01, 0.01, self.bottom_text, fontsize=self.fontsize)
            self.fig.tight_layout()
            # Reserve roughly one tenth of the figure height per text line.
            self.fig.subplots_adjust(bottom=(self.bottom_text.count('\n') + 1) * 0.1)
        else:
            self.fig.tight_layout()
        if legend_outside:
            self.axes.legend(bbox_to_anchor=(0.5, -0.05), loc='upper center', fontsize=8)
            plt.subplots_adjust(bottom=0.25)
        elif self._activate_legend:
            self.axes.legend(fontsize=self.fontsize)
        if self.range_tuple is not None:
            self.axes.set_ylim(self.range_tuple)
        if self.show_grid:
            plt.grid(True, which='both')
            plt.minorticks_on()
        super()._post_processing()

    def matrix_plot(self, matrix, as_surface='2d', show_values=False):
        """
        Plots the values of a matrix on a 2d or a 3d axes
        :param matrix: ndarray (2-ndim)
            matrix, which should be plotted
        :param as_surface: str
            Plot as a 3d-surface if value PLOT_3D_MAP else 2d-axes
        :param show_values: If true, then show the values in the matrix
        """
        c_map = plt.cm.viridis
        # c_map = plt.cm.seismic
        if as_surface == PLOT_3D_MAP:
            x_coordinates = np.arange(matrix.shape[0])
            y_coordinates = np.arange(matrix.shape[1])
            x_coordinates, y_coordinates = np.meshgrid(x_coordinates, y_coordinates)
            self.fig = plt.figure()
            # NOTE(review): Figure.gca(projection='3d') was removed in
            # matplotlib >= 3.6 — verify the pinned matplotlib version.
            self.axes = self.fig.gca(projection='3d')
            self.axes.set_zlabel('Covariance Values', fontsize=self.fontsize)
            im = self.axes.plot_surface(x_coordinates, y_coordinates, matrix, cmap=c_map)
        else:
            self.fig, self.axes = plt.subplots(1, 1, dpi=80)
            im = self.axes.matshow(matrix, cmap=c_map)
            if show_values:
                # Annotate every cell with its value.
                for (i, j), value in np.ndenumerate(matrix):
                    self.axes.text(j, i, '{:0.2f}'.format(value), ha='center', va='center', fontsize=8)
        if not self.for_paper:
            self.fig.colorbar(im, ax=self.axes)
        # Shift tick labels by xtick_start while keeping tick positions.
        plt.xticks(np.arange(matrix.shape[1]),
                   np.arange(self.xtick_start, matrix.shape[1] + self.xtick_start))
        # plt.xticks(np.arange(matrix.shape[1], step=5),
        #            np.arange(self.xtick_start, matrix.shape[1] + self.xtick_start, step=5))
        self._post_processing()

    def plot_gauss2d(self, x_index: np.ndarray, ydata: np.ndarray, new_ydata: np.ndarray,
                     gauss_fitted: np.ndarray, fit_method: str,
                     statistical_function: callable = np.median):
        """
        Plot the original data (ydata), the new data (new_ydata) where the
        x-axis-indices is given by (x_index), the (fitted) gauss curve and a
        line (mean, median)
        :param x_index: ndarray (1-ndim)
            range of plotting
        :param ydata: ndarray (1-ndim)
            original data
        :param new_ydata: ndarray (1-ndim)
            the changed new data
        :param gauss_fitted: ndarray (1-ndim)
            the fitted curve on the new data
        :param fit_method: str
            the name of the fitting method
        :param statistical_function: callable
            Some statistical numpy function
        :return:
        """
        self.fig, self.axes = plt.subplots(1, 1, dpi=80)
        self.axes.plot(x_index, gauss_fitted, '-', label=f'fit {fit_method}')
        # self.axes.plot(x_index, gauss_fitted, ' ')
        self.axes.plot(x_index, ydata, '.', label='original data')
        # self.axes.plot(x_index, ydata, ' ')
        # Horizontal line at the statistic (e.g. median) of the original data.
        statistical_value = np.full(x_index.shape, statistical_function(ydata))
        if self.for_paper:
            function_label = 'threshold'
        else:
            # NOTE(review): `function_name` is not defined or imported in this
            # chunk — presumably a module-level helper elsewhere; verify.
            function_label = function_name(statistical_function)
        self._activate_legend = True
        self.axes.plot(x_index, statistical_value, '-', label=function_label)
        # self.axes.plot(x_index, statistical_value, ' ')
        # self.axes.plot(x_index, new_ydata, '.', label='re-scaled data')
        self.axes.plot(x_index, new_ydata, ' ')
        self._post_processing()

    def plot_2d(self, ndarray_data, statistical_func=None):
        """Plot a 1d array as a line; optionally overlay a horizontal line at
        `statistical_func(ndarray_data)` (e.g. mean/median)."""
        self.fig, self.axes = plt.subplots(1, 1)
        self.axes.plot(ndarray_data, '-')
        if statistical_func is not None:
            statistical_value = statistical_func(ndarray_data)
            statistical_value_line = np.full(ndarray_data.shape, statistical_value)
            self.axes.plot(statistical_value_line, '-',
                           label=f'{function_name(statistical_func)}: {statistical_value:.4f}')
        self._activate_legend = False
        self._post_processing()

    def plot_merged_2ds(self, ndarray_dict: dict, statistical_func=None):
        """Plot several 1d arrays (dict values) into one axes, color-cycled;
        optionally add a dashed line per series at its statistic value."""
        self.fig, self.axes = plt.subplots(1, 1, dpi=80)
        self.title_prefix += (f'with {function_name(statistical_func)}'
                              if statistical_func is not None else '')
        for key, ndarray_data in ndarray_dict.items():
            # noinspection PyProtectedMember
            # NOTE(review): `_get_lines.prop_cycler` was removed in
            # matplotlib >= 3.8 — verify the pinned matplotlib version.
            color = next(self.axes._get_lines.prop_cycler)['color']
            if statistical_func is not None:
                if isinstance(ndarray_data, list):
                    ndarray_data = np.asarray(ndarray_data)
                self.axes.plot(ndarray_data, '-', color=color)
                statistical_value = statistical_func(ndarray_data)
                statistical_value_line = np.full(ndarray_data.shape, statistical_value)
                self.axes.plot(statistical_value_line, '--',
                               label=f'{key.strip()}: {statistical_value:.4f}', color=color)
            else:
                self.axes.plot(ndarray_data, '-', color=color, label=f'{key.strip()[:35]}')
        self._activate_legend = True
        self._post_processing()
class BenchmarkPlotter:
    """Plots correlation heat maps for benchmark (mining) results.

    NOTE: plotting (and saving, when `output_path` is given) is triggered
    from the constructor as a side effect.
    """

    def __init__(self, benchmark_results, output_path=None):
        self.plot_miners_correlation(benchmark_results, output_path=output_path)
        self.plot_miner_feat_correlation(benchmark_results, output_path=output_path)
        self.plot_miner_feat_correlation(benchmark_results, mean='methods', output_path=output_path)

    def plot_miner_feat_correlation(self, benchmark, mean='metrics', output_path=None):
        """Correlate event-log features with averaged miner performance.

        :param benchmark: pd.DataFrame with a 'log' column, feature columns and
            miner result columns prefixed 'inductive'/'heu'/'ilp'.
        :param mean: 'metrics' averages per quality metric
            (fitness/precision/generalization/simplicity), 'methods' per miner.
        :param output_path: directory to save the figure into; nothing is
            written when None.
        """
        df = benchmark.loc[:, benchmark.columns != 'log']
        corr = df.corr()
        if mean == 'methods':
            for method in ['inductive', 'heu', 'ilp']:
                method_cols = [col for col in corr.columns if col.startswith(method)]
                corr[method + '_avg'] = corr.loc[:, corr.columns.isin(method_cols)].mean(axis=1)
        elif mean == 'metrics':
            for metric in ['fitness', 'precision', 'generalization', 'simplicity']:
                metric_cols = [col for col in corr.columns if col.endswith(metric)]
                corr[metric + '_avg'] = corr.loc[:, corr.columns.isin(metric_cols)].mean(axis=1)
        avg_cols = [col for col in corr.columns if col.endswith('_avg')]
        benchmark_result_cols = [col for col in corr.columns
                                 if col.startswith('inductive') or col.startswith('heu')
                                 or col.startswith('ilp')]
        # Keep only feature rows: drop the raw miner-result rows.
        corr = corr[:][~corr.index.isin(benchmark_result_cols)]
        fig, axes = plt.subplots(1, len(avg_cols), figsize=(15, 10))
        for i, ax in enumerate(axes):
            # BUGFIX: the colorbar was hard-coded to subplot index 3, which
            # never exists for mean='methods' (only 3 averages) — draw it on
            # the last subplot instead.
            cbar = i == len(avg_cols) - 1
            corr = corr.sort_values(avg_cols[i], axis=0, ascending=False)
            sns.heatmap(corr[[avg_cols[i]]][:], ax=ax, xticklabels=[avg_cols[i]],
                        yticklabels=corr.index, cbar=cbar)
        plt.subplots_adjust(wspace=1, top=0.9, left=0.15)
        fig.suptitle(f"Feature and performance correlation per {mean.split('s')[0]} "
                     f"for {len(benchmark)} event-logs")
        if output_path is not None:
            output_path = output_path + f"/minperf_corr_{mean.split('s')[0]}_el{len(benchmark)}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved correlation plot at {output_path}")
        # plt.show()

    def plot_miners_correlation(self, benchmark, output_path=None):
        """Correlate the miners' performance columns with each other."""
        benchmark_result_cols = [col for col in benchmark.columns
                                 if col.startswith('inductive') or col.startswith('heu')
                                 or col.startswith('ilp')]
        df = benchmark.loc[:, benchmark.columns != 'log']
        df = df.loc[:, df.columns.isin(benchmark_result_cols)]
        corr = df.corr()
        fig, ax = plt.subplots(figsize=(15, 10))
        sns.heatmap(corr, ax=ax, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
        plt.title(f"Miners and performance correlation for {len(benchmark)} event-logs", loc='center')
        if output_path is not None:
            output_path = output_path + f"/minperf_corr_el{len(benchmark)}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved correlation plot at {output_path}")
        # plt.show()
class FeaturesPlotter:
    """Plots feature distributions of generated (and optionally real) event
    logs as violin/box plots. Plotting and saving happen in the constructor."""

    def __init__(self, features, params=None):
        output_path = params[OUTPUT_PATH] if OUTPUT_PATH in params else None
        # REFACTOR: the original built keyword-argument strings and dispatched
        # through eval(...) — replaced with a plain kwargs dict (same calls,
        # no string evaluation).
        kwargs = {}
        if params.get(PLOT_TYPE):
            kwargs['plot_type'] = params[PLOT_TYPE]
        if params.get(FONT_SIZE):
            kwargs['font_size'] = params[FONT_SIZE]
        if params.get(BOXPLOT_WIDTH):
            kwargs['boxplot_w'] = params[BOXPLOT_WIDTH]
        source_name = os.path.split(params['input_path'])[-1].replace(".csv", "") + "_"
        # output_path = os.path.join(output_path, source_name)
        if REAL_EVENTLOG_PATH in params:
            real_eventlogs_path = params[REAL_EVENTLOG_PATH]
            real_eventlogs = pd.read_csv(real_eventlogs_path)
            if params.get(PIPELINE_STEP):
                kwargs['legend'] = True  # legend only supported by the multi plot
            fig, output_path = self.plot_violinplot_multi(features, output_path, real_eventlogs,
                                                          source=source_name, **kwargs)
        else:
            fig, output_path = self.plot_violinplot_single(features, output_path,
                                                           source=source_name, **kwargs)
        if output_path is not None:
            os.makedirs(os.path.split(output_path)[0], exist_ok=True)
            fig.savefig(output_path)
            # BUGFIX: the original printed the raw ", plot_type='...'" fragment here.
            print(f"SUCCESS: Saved {kwargs.get('plot_type', 'violinplot')} plot in {output_path}")

    def plot_violinplot_single(self, features, output_path=None, source="_",
                               plot_type="violinplot", font_size=16, boxplot_w=16):
        """Draw one distribution plot per numeric feature column.

        :param features: pd.DataFrame of generated event-log features.
        :param plot_type: name of the seaborn plot function (e.g. 'violinplot').
        :return: (figure, derived output file path)
        """
        columns = features.columns[1:]
        df1 = features.select_dtypes(exclude=['object'])
        fig, axes = plt.subplots(len(df1.columns), 1, figsize=(int(boxplot_w), len(df1.columns)))
        for i, ax in enumerate(axes):
            # Dispatch to the requested seaborn plot kind without eval().
            getattr(sns, plot_type)(data=df1, x=df1[df1.columns[i]], ax=ax)
        fig.suptitle(f"{len(columns)} features distribution for {len(features)} generated event-logs",
                     fontsize=font_size, y=1)
        fig.tight_layout()
        output_path = output_path + f"/{plot_type}s_{source}{len(columns)}fts_{len(df1)}gEL.jpg"
        return fig, output_path

    def plot_violinplot_multi(self, features, output_path, real_eventlogs, source="_",
                              plot_type="violinplot", font_size=24, legend=False, boxplot_w=16):
        """Overlay distribution plots of generated vs. real event-log features.

        :param features: pd.DataFrame of generated event-log features (gets a
            'Log Nature' column added in place).
        :param real_eventlogs: pd.DataFrame of real event-log features
            (also gets a 'Log Nature' column added in place).
        :return: (figure, derived output file path)
        """
        LOG_NATURE = "Log Nature"
        GENERATED = "Generated"
        REAL = "Real"
        alpha = 0.7
        color = sns.color_palette("bright")
        markers = ['o', 'X']

        # Tag both frames and concatenate for overlayed plotting.
        features[LOG_NATURE] = GENERATED
        real_eventlogs[LOG_NATURE] = REAL
        bdf = pd.concat([features, real_eventlogs])
        bdf = bdf[features.columns]
        bdf = bdf.dropna(axis='rows')
        columns = bdf.columns[3:]
        dmf1 = bdf.select_dtypes(exclude=['object'])

        # Violin plots are drawn without inner sticks; the original also passed
        # this kwarg to stripplot — kept for behavioral compatibility
        # (NOTE(review): verify seaborn tolerates 'inner' on stripplot).
        extra_kwargs = {'inner': None} if plot_type == 'violinplot' else {}
        plot_fn = getattr(sns, plot_type)  # dispatch without eval()

        fig, axes = plt.subplots(len(dmf1.columns), 1,
                                 figsize=(int(boxplot_w), len(dmf1.columns) * 1.25), dpi=300)
        if isinstance(axes, Axes):  # a single feature yields a bare Axes, not an array
            axes = [axes]
        # nature_types = set(['Generated', 'Real'])  # set(bdf['Log Nature'].unique())
        nature_types = list(reversed(bdf['Log Nature'].unique()[:2]))
        for i, ax in enumerate(axes):
            for j, nature in enumerate(nature_types):
                subset = bdf[bdf['Log Nature'] == nature]
                plot_fn(data=subset, x=dmf1.columns[i], palette=[color[j]], ax=ax, **extra_kwargs)
                sns.stripplot(data=subset, x=dmf1.columns[i], palette=[color[j]],
                              marker=markers[j], ax=ax, **extra_kwargs)
            # Apply uniform transparency to points and violin bodies.
            for collection in ax.collections:
                collection.set_alpha(alpha)
            for patch in ax.patches:
                r, g, b, a = patch.get_facecolor()
                patch.set_facecolor((r, g, b, alpha))
            custom_lines = [Line2D([0], [0], color=color[k], lw=4, alpha=alpha) for k in [0, 1, 2]]
            # ax.legend(custom_lines, bdf['Log Nature'].unique(), title="Log Nature")
            ax.tick_params(axis='both', which='major', labelsize=font_size)
            ax.tick_params(axis='both', which='minor', labelsize=font_size)
            ax.set_xlabel(dmf1.columns[i], fontsize=font_size)
        if legend:
            fig.legend(custom_lines, nature_types, loc='upper right',
                       ncol=len(nature_types), prop={'size': font_size})
            plt.legend(fontsize=font_size)
        plt.yticks(fontsize=font_size)
        plt.xticks(fontsize=font_size)
        fig.tight_layout()
        output_path = output_path + (f"/{plot_type}s_{source}{len(columns)}fts_{len(features)}gEL_of"
                                     f"{len(bdf[bdf['Log Nature'].isin(nature_types)])}.jpg")
        return fig, output_path


class AugmentationPlotter(object):
    """Plotter for the augmented features.

    If just 2 features are examined, the plotter outputs a scatterplot with
    the two features defining the dimensions. If more than 2 features are
    examined, a PCA is performed first before the first two principal
    components are plotted.

    Parameters
    ----------
    features : pd.DataFrame
        dataFrame containing the information of the real and synthesized datasets.
    """

    def __init__(self, features, params=None) -> None:
        output_path = params[OUTPUT_PATH] if OUTPUT_PATH in params else None
        self.sampler = params['augmentation_params']['method']
        # REFACTOR: direct call instead of the original pointless eval(...).
        self.plot_augmented_features(features, output_path)

    def plot_augmented_features(self, features, output_path=None) -> None:
        """Plotting for augmented features.

        When more than 2 features are selected, the plot will show the result
        after applying a PCA; otherwise the 2 features are plotted according
        to the values.

        Parameters
        ----------
        features : pd.DataFrame
            DataFrame containing the augmented features
        output_path : str, optional
            Path to the output file, by default None
        """
        if len(features.all.columns) < 2:
            # BUGFIX(message): the check requires >= 2 features; the original
            # message claimed "More than 2".
            raise AssertionError(
                "AugmentationPlotter - At least 2 (augmented) features are expected for plotting.")
        if len(features.all.columns) > 2:
            self._plot_pca(features, output_path)
        else:
            self._plot_2d(features, output_path)
    def _plot_2d(self, features, output_path=None) -> None:
        """Fnc for plotting 2D features without any dimension reduction technique being applied.

        Parameters
        ----------
        features : pd.DataFrame
            Dataframe containing the augmented features
        output_path : str, optional
            Path to the output file, by default None
        """
        col1_name, col2_name = features.all.columns
        # INIT - settings
        # Real samples are the leading rows; the trailing `new_samples` rows are synthetic.
        X = features.all.iloc[:-features.new_samples.shape[0]]
        X = X.to_numpy()
        X_aug = features.all.to_numpy()
        sns.set_theme()
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
        fig.suptitle(f'Log Descriptors - real: {X.shape[0]}, synth.: {X_aug.shape[0]-X.shape[0]}',
                     fontsize=16)
        # Normalizer: applied to each observation -> row values have unit norm
        normalizer = Normalizer(norm="l2").fit(X)
        normed_data = normalizer.transform(X_aug)
        # StandardScaler: applied to features -> col values have unit norm
        scaler = StandardScaler().fit(X)
        scaled_data = scaler.transform(X_aug)
        # PLOT - raw 2d data
        X_aug = self._add_real_synth_encoding(X_aug, X, X_aug)
        df_raw = self._convert_to_df(X_aug, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax1, data=df_raw, x=col1_name, y=col2_name, palette="bright",
                        hue="type", alpha=0.5, s=100).set_title("Raw data")
        ax1.get_legend().set_title("")
        # PLOT - normed 2d data
        normed_data = self._add_real_synth_encoding(normed_data, X, X_aug)
        df_normed = self._convert_to_df(normed_data, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax2, data=df_normed, x=col1_name, y=col2_name, palette="bright",
                        hue='type', alpha=0.5, s=100).set_title("Normalized data")
        ax2.get_legend().set_title("")
        # PLOT - scaled 2d data
        scaled_data = self._add_real_synth_encoding(scaled_data, X, X_aug)
        df_scaled = self._convert_to_df(scaled_data, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax3, data=df_scaled, x=col1_name, y=col2_name, palette="bright",
                        hue='type', alpha=0.5, s=100).set_title("Scaled data")
        ax3.get_legend().set_title("")
        plt.tight_layout()
        # OUTPUT
        if output_path != None:
            output_path += f"/augmentation_2d_plot_{col1_name}-{col2_name}_{self.sampler}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved augmentation pca plot at {output_path}")

    def _add_real_synth_encoding(self, arr, X, X_aug) -> np.array:
        """Helper function for adding one additional column to the array in the
        last column. The last column indicates whether it is a real data (=0)
        or synthesized (=1).

        Parameters
        ----------
        arr : np.array
            data array
        X : np.array
            data of real datasets
        X_aug : np.array
            data of real datasets and synthesized datasets

        Returns
        -------
        np.array
            array containing the data with an additional last column indicating
            whether the data comes from a real dataset or synthesized one
        """
        real_synth_enc = np.array([0] * X.shape[0] + [1] * (X_aug.shape[0] - X.shape[0])).reshape(-1, 1)
        return np.hstack([arr, real_synth_enc])

    def _convert_to_df(self, arr, colnames, enc=['real', 'synth']) -> pd.DataFrame:
        """Converts the attached array to a dataframe. The column names are
        defined by the respective parameters, where the last column is encoded
        by the string array of the enc parameter.

        Parameters
        ----------
        arr : np.array
            data array whose last column holds the 0/1 real-vs-synth encoding
        colnames : list
            column names of returned dataframe
        enc : list, optional
            labels for real vs. generated data, by default ['real', 'synth']

        Returns
        -------
        pd.DataFrame
            dataframe containing the attached data array with encoded values
            in the last column
        """
        df = pd.DataFrame(arr, columns=colnames)
        df.loc[df.iloc[:, -1] == 0, colnames[-1]] = enc[0]
        df.loc[df.iloc[:, -1] == 1, colnames[-1]] = enc[1]
        return df

    def _plot_pca(self, features, output_path=None) -> None:
        """Fnc for plotting features with PCA as dimension reduction technique being applied.

        Parameters
        ----------
        features : pd.DataFrame
            DataFrame containing the augmented features
        output_path : str, optional
            path to output file, by default None
        """
        # INIT - settings
        n_features = features.all.shape[1]
        X = features.all.iloc[:-features.new_samples.shape[0]]
        X = X.to_numpy()
        X_aug = features.all.to_numpy()
        sns.set_theme()
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
        fig.suptitle(f'Log Descriptors - real: {X.shape[0]}, synth.: {X_aug.shape[0]-X.shape[0]}',
                     fontsize=16)
        pca_components = 2
        # NOTE: the same PCA object is re-fitted three times below; the axis
        # labels are set right after each fit, before the next one.
        pca = PCA(n_components=pca_components)
        # Normalizer: applied to each observation -> row values have unit norm
        normalizer = Normalizer(norm="l2").fit(X)
        normed_data_real = normalizer.transform(X)
        normed_data_aug = normalizer.transform(X_aug)
        # StandardScaler: applied to features -> col values have unit norm
        scaler = StandardScaler().fit(X)
        scaled_data_real = scaler.transform(X)
        scaled_data_aug = scaler.transform(X_aug)
        # PLOT - PCA on raw input
        fit_pca = pca.fit(X)
        X_new = fit_pca.transform(X_aug)
        X_new = self._add_real_synth_encoding(X_new[:, :pca_components], X, X_aug)
        df_pca = self._convert_to_df(X_new, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax1, data=df_pca, x="PC_1", y="PC_2", palette="bright",
                        hue='type', alpha=0.5, s=100)
        ax1.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax1.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax1.get_legend().set_title("")
        # PLOT - PCA on normed data
        fit_norm_pca = pca.fit(normed_data_real)
        X_new_normed = fit_norm_pca.transform(normed_data_aug)
        X_new_normed = self._add_real_synth_encoding(X_new_normed[:, :pca_components], X, X_aug)
        df_pca_normed = self._convert_to_df(X_new_normed, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax2, data=df_pca_normed, x="PC_1", y="PC_2", palette="bright",
                        hue='type', alpha=0.5, s=100)
        ax2.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax2.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax2.get_legend().set_title("")
        # PLOT - PCA on scaled data
        fit_sca_pca = pca.fit(scaled_data_real)
        X_new_sca = fit_sca_pca.transform(scaled_data_aug)
        X_new_sca = self._add_real_synth_encoding(X_new_sca[:, :pca_components], X, X_aug)
        df_pca_scaled = self._convert_to_df(X_new_sca, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax3, data=df_pca_scaled, x="PC_1", y="PC_2", palette="bright",
                        hue='type', alpha=0.5, s=100)
        ax3.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax3.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax3.get_legend().set_title("")
        plt.tight_layout()
        # OUTPUT
        if output_path != None:
            output_path += f"/augmentation_pca_{n_features}_{self.sampler}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved augmentation pca plot at {output_path}")


class GenerationPlotter(object):
    """Plots generated event-log features against generation targets.
    NOTE(review): this class continues beyond the visible chunk; only the
    start of __init__ is shown below — it continues on the following lines.
    """

    def __init__(self, gen_cfg, model_params, output_path, input_path=None):
        print(f"Running plotter for {len(gen_cfg)} genEL, params {model_params}, output path: {output_path}")
        self.output_path = output_path
        self.input_path = input_path
        self.model_params = model_params
        if gen_cfg.empty:  # Deactivated for tests
            return
        if "metafeatures" in gen_cfg.columns:
            self.gen = gen_cfg.metafeatures
            self.gen = pd.concat([pd.DataFrame.from_dict(entry, orient="Index").T
                                  for entry in self.gen]).reset_index(drop=True)
        else:
            self.gen = gen_cfg.reset_index(drop=True)
        if GENERATOR_PARAMS in model_params:
            self.tasks, _ = get_tasks(model_params[GENERATOR_PARAMS][EXPERIMENT])
            feature_list = list(self.tasks.select_dtypes(exclude=['object']).keys())
            ref_feat = None
            if PLOT_REFERENCE_FEATURE in model_params[GENERATOR_PARAMS] and model_params[GENERATOR_PARAMS][PLOT_REFERENCE_FEATURE] != "":
                ref_feat = model_params[GENERATOR_PARAMS][PLOT_REFERENCE_FEATURE]
            reference_feature_list = feature_list if ref_feat is None else [ref_feat]
self.plot_settings() if input_path is not None: # plot single reference feature compared to values stored in .csvs if isinstance(input_path, str) and input_path.endswith(".csv"): f_d = pd.read_csv(input_path) f_d = {model_params['reference_feature']: f_d} elif isinstance(input_path, list): self.plot_dist_mx(model_params) else: f_d = read_csvs(input_path, model_params['reference_feature']) tasks, _ = get_tasks(model_params['targets'], reference_feature=model_params['reference_feature']) self.plot_reference_feature_plot(tasks, f_d, model_params['reference_feature']) else: # start all plotting procedures at once self.plot_feat_comparison(feature_list, reference_feature_list) def plot_reference_feature_plot(self, orig_targets, f_dict, reference_feature, resolution=10): fig1, axes = plt.subplots(1, len(f_dict), figsize=(20, 4)) if isinstance(axes,Axes): axes = [axes] fig2, axes_mesh = plt.subplots(1, len(f_dict), figsize=(20, 4), layout='compressed') if isinstance(axes_mesh, Axes): axes_mesh = [axes_mesh] for idx_ax, (k, v) in enumerate(f_dict.items()): if isinstance(orig_targets, pd.DataFrame): targets = orig_targets.copy() elif isinstance(orig_targets, defaultdict): if k not in orig_targets: print(f"[WARNING] {k} not in targets. Only in generated features. Will continue with next feature to compare with") continue targets = orig_targets[k].copy() else: print(f"[ERR] Unknown file format for targets {type(orig_targets)}. 
Close program (Exit Code: 0).") # Identify NAN values of reference feature target_nan_values_idx_reference = np.where(targets[reference_feature].isna())[0] target_nan_logs_reference = targets.loc[target_nan_values_idx_reference]['log'] # Identify NAN values of competitor feature target_nan_values_idx_competitor = np.where(targets[k].isna())[0] target_nan_logs_competitor = targets.loc[target_nan_values_idx_competitor]['log'] # Collection of indices to drop target_nan_indices = np.unique(np.concatenate((target_nan_values_idx_competitor, target_nan_values_idx_reference))) # Drop NAN values in target DataFrame targets.drop(axis='index', index=target_nan_indices, inplace=True) # Check for indices in generated DataFrame reference_values_idx_reference = v[v['log'].isin(list(target_nan_logs_reference))].index reference_values_idx_competitor = v[v['log'].isin(list(target_nan_logs_competitor))].index # Collection of indices to drop for reference reference_nan_indices = np.unique(np.concatenate((reference_values_idx_reference, reference_values_idx_competitor))) # Drop NAN values in generated DataFrame v.drop(axis='index', index=reference_nan_indices, inplace=True) # Plot generated DataFrame + target DataFrame v.plot.scatter(x=v.columns.get_loc(reference_feature), y=v.columns.get_loc(k), ax=axes[idx_ax], c="red", alpha=0.3) targets.plot.scatter(x=targets.columns.get_loc(reference_feature), y=targets.columns.get_loc(k), ax=axes[idx_ax], c='blue', alpha=0.3) Z = np.zeros([resolution+1, resolution+1]) cnt_Z = np.zeros([resolution+1, resolution+1]) Z.fill(np.nan) min_Z_X = np.min(targets[reference_feature]) min_Z_Y = np.min(targets[k]) max_Z_X = np.max(targets[reference_feature]) max_Z_Y = np.max(targets[k]) step_Z_X = np.round((max_Z_X - min_Z_X) / float(resolution), 4) step_Z_Y = np.round((max_Z_Y - min_Z_Y) / float(resolution), 4) cum_sum=0 for idx in v.index: if isinstance(v, pd.DataFrame) and 'log' in v.columns: c_log = v.loc[idx, 'log'] if c_log in targets['log'].values: 
# --- continuation of a plotting method whose `def` line lies above this chunk ---
# NOTE(review): the original line breaks were lost; indentation below is
# reconstructed from the syntax -- TODO confirm nesting against the full file.
# Context inferred from the visible code: we are inside a loop over `f_dict`
# (subplot index `idx_ax`, feature name `k`, values `v`), and inside an inner
# loop over `v`'s rows (`idx`), matching each objective row against a generated
# entry in `targets` by its 'log' name.
                        gen_entry = targets[targets['log'] == c_log]
                    else:
                        # No generated counterpart for this log name: fall back
                        # to the whole `targets` frame (its first row is used below).
                        print(f"INFO: no value for {c_log} in generated files.")
                        gen_entry = targets
                else:
                    gen_entry = targets
                # Plot connection line between the objective point and its
                # generated counterpart.
                axes[idx_ax].plot([v[reference_feature][idx], gen_entry[reference_feature].values[0]],
                                  [v[k][idx], gen_entry[k].values[0]],
                                  c="green", alpha=0.25)
                # Plot textual annotation: label the generated point with its log name.
                axes[idx_ax].annotate(gen_entry['log'].values[0],
                                      (gen_entry[reference_feature].values[0], gen_entry[k].values[0]),
                                      fontsize=5)
                # Compute distance between real and generated dot (Euclidean,
                # in the 2-D (reference_feature, k) plane).
                vec1 = np.array([v[reference_feature][idx], v[k][idx]])
                vec2 = np.array([gen_entry[reference_feature].values[0], gen_entry[k].values[0]])
                # Bucket the generated point into a mesh cell; min_Z_X/step_Z_X
                # etc. are defined earlier in this method (not visible here).
                Z_idx = int(np.round((gen_entry[reference_feature].values[0] - min_Z_X) / step_Z_X))
                Z_idy = int(np.round((gen_entry[k].values[0] - min_Z_Y) / step_Z_Y))
                if np.isnan(Z[Z_idx][Z_idy]):
                    # First hit for this cell: switch it from the NaN marker to a sum.
                    Z[Z_idx][Z_idy] = 0.0
                Z[Z_idx][Z_idy] += np.linalg.norm(vec1 - vec2)
                cnt_Z[Z_idx][Z_idy] += 1
                cum_sum += np.linalg.norm(vec1 - vec2)
            print(f"INFO: Cumulated distances objectives <-> generated features for '{reference_feature}' vs. '{k}': {cum_sum:.4f}")
            X, Y = np.meshgrid(np.linspace(min_Z_X, max_Z_X, resolution+1),
                               np.linspace(min_Z_Y, max_Z_Y, resolution+1))
            cmap = plt.colormaps['viridis_r']
            # Cells that were never hit keep NaN; fill them with sqrt(2), the
            # largest possible distance in a unit square (presumably features
            # are normalised to [0, 1] -- TODO confirm upstream normalisation).
            Z[np.isnan(Z)] = np.sqrt(2)
            # Avoid division by zero for empty cells, then average per cell.
            cnt_Z[cnt_Z==0] = 1
            Z /= cnt_Z
            colormesh = axes_mesh[idx_ax].pcolormesh(X, Y, Z.T, shading='nearest', cmap=cmap)
            axes_mesh[idx_ax].set_xlabel(reference_feature)
            axes_mesh[idx_ax].set_ylabel(k)
            if idx_ax == (len(f_dict)-1):
                # Only the last subplot carries the shared colorbar for fig2.
                cbar = fig2.colorbar(colormesh, ax=axes_mesh, orientation='vertical', pad=0.01)
                cbar.ax.set_ylabel('Feature dist. \nof generated EDs and objectives',
                                   fontsize=8, rotation=90, labelpad=-50)
            axes[idx_ax].set_title(f"Cumulated distances {cum_sum:.4f}")
        # ---- figure titles and output paths, then save both figures ----
        tasks_keys = f_dict.keys()
        tasks_keys = list(sorted(tasks_keys))
        abbreviations = get_keys_abbreviation(tasks_keys)
        ref_short_name = get_keys_abbreviation([reference_feature])
        fig1_title = f'Feature Comparison - {reference_feature}'
        fig1.suptitle(fig1_title, fontsize=6)
        fig1.tight_layout()
        distance_plot_path = os.path.join(self.output_path,
                                          f"plot_genEL{len(self.gen)}_tasks{len(tasks_keys)}_{ref_short_name}_vs_{abbreviations}.png")
        fig1.savefig(distance_plot_path)
        print(f"Saved objectives vs. genEL features plot in {distance_plot_path}")
        fig2.suptitle(f'Meshgrid Comparison - {reference_feature}', fontsize=6)
        meshgrid_plot_path = os.path.join(self.output_path,
                                          f"plot_meshgrid_genEL{len(self.gen)}_tasks{len(tasks_keys)}_{ref_short_name}_vs_{abbreviations}.png")
        fig2.savefig(meshgrid_plot_path)
        print(f"Saved meshgrid plot in {meshgrid_plot_path}")

    def plot_single_comparison(self, tasks, objective1, objective2, ax, ax_cmesh, fig2, axes_meshes, flag_plt_clbar):
        """
        Plot one pairwise comparison of two objective features: scatter the task
        (target) points and the generated points, connect each task point to its
        generated counterpart, and render a mesh of mean task<->generated distances.

        :param tasks: DataFrame of target/task feature values; expected to contain
            at least the `objective1`/`objective2` columns (and optionally 'log')
        :param objective1: column name plotted on the x axis
        :param objective2: column name plotted on the y axis
        :param ax: axis for the scatter/connection-line plot (belongs to fig1)
        :param ax_cmesh: axis for the distance mesh plot (belongs to fig2)
        :param fig2: figure owning the mesh axes; receives the colorbar
        :param axes_meshes: all mesh axes; passed to `fig2.colorbar` as anchor
        :param flag_plt_clbar: bool; draw the shared colorbar if True
        :return: the `QuadMesh` returned by `pcolormesh`

        NOTE(review): this method mutates the caller's `tasks` frame when no
        object-dtype column exists (adds a 'task' column) -- confirm callers
        tolerate this side effect.
        """
        if len(tasks.select_dtypes(include=['object']).columns)==0:
            # No string-like id column present: synthesize one ("task_1", ...).
            tasks['task']=[f"task_{str(x+1)}" for x in tasks.index.values.tolist()]
        # First object-dtype column without NaNs serves as the row label.
        id_col = tasks.select_dtypes(include=['object']).dropna(axis=1).columns[0]
        tasks.plot.scatter(x=objective1, y=objective2, ax=ax, alpha=0.3)
        self.gen.plot.scatter(x=objective1, y=objective2, c="red", ax=ax, alpha=0.3)
        # Mesh accumulators, one cell per unique (objective1, objective2) value
        # pair; inf marks "no distance recorded yet".
        Z = np.zeros([tasks[objective1].unique().size, tasks[objective2].unique().size])
        cnt_Z = np.zeros([tasks[objective1].unique().size, tasks[objective2].unique().size])
        Z.fill(np.inf)
        cum_sum = 0
        for idx in tasks.index:
            # Match this task row to a generated entry by 'log' name if possible.
            if isinstance(tasks, pd.DataFrame) and 'log' in tasks.columns:
                c_log = tasks.loc[idx, 'log']
                if c_log in self.gen['log'].values:
                    gen_entry = self.gen[self.gen['log'] == c_log]
                else:
                    # Fall back to the whole generated frame (first row used below).
                    print(f"INFO: no value for {c_log} in generated files.")
                    gen_entry = self.gen
            else:
                gen_entry = self.gen
            # Connection line task point -> generated point.
            ax.plot([tasks[objective1][idx], gen_entry[objective1].values[0]],
                    [tasks[objective2][idx], gen_entry[objective2].values[0]],
                    c="green", alpha=0.25)
            ax.annotate(tasks[id_col][idx],
                        (tasks[objective1][idx], tasks[objective2][idx]),
                        fontsize=5)
            # Euclidean distance between task and generated feature vectors.
            vec1 = np.array([tasks[objective1][idx], tasks[objective2][idx]])
            vec2 = np.array([gen_entry[objective1].values[0], gen_entry[objective2].values[0]])
            # Locate this task value in the unique-value grid.
            Z_idx = np.where(tasks[objective1].unique() == tasks[objective1][idx])[0][0]
            Z_idy = np.where(tasks[objective2].unique() == tasks[objective2][idx])[0][0]
            if np.isinf(Z[Z_idx][Z_idy]):
                # First hit for this cell: switch from inf marker to a sum.
                Z[Z_idx][Z_idy] = 0.0
            Z[Z_idx][Z_idy] += np.linalg.norm(vec1 - vec2)
            cnt_Z[Z_idx][Z_idy] += 1
            cum_sum += np.linalg.norm(vec1 - vec2)
        print(f"INFO: Cumulated distances objectives <-> generated features for '{objective1}' vs. '{objective2}':", cum_sum)
        ax.set_title(f"Cumulated distances {cum_sum:.4f}")
        X, Y = np.meshgrid(tasks[objective1].unique(), tasks[objective2].unique())
        cmap = plt.colormaps['viridis_r']
        # Unvisited cells get sqrt(2) (max distance in a unit square --
        # presumably features are normalised to [0, 1]; TODO confirm).
        Z[np.isinf(Z)] = np.sqrt(2)
        # Avoid division by zero for empty cells, then average per cell.
        cnt_Z[cnt_Z==0] = 1
        Z /= cnt_Z
        colormesh = ax_cmesh.pcolormesh(X, Y, Z.T, shading='nearest', cmap=cmap)  # vmin=0.0, vmax=1.0, cmap=cmap)
        ax_cmesh.set_xlabel(objective1)
        ax_cmesh.set_ylabel(objective2)
        if flag_plt_clbar:
            # Single shared colorbar anchored on all mesh axes.
            fig2.colorbar(colormesh, ax=axes_meshes, orientation='vertical')
        return colormesh

    def plot_settings(self):
        """Set global matplotlib rc font sizes used by the comparison plots."""
        mpl.rc('axes', titlesize=8)    # fontsize of the axes title
        mpl.rc('axes', labelsize=8)    # fontsize of the x and y labels
        mpl.rc('font', size=8)

    def plot_feat_comparison(self, feature_list, reference_list):
        """
        Draw a grid of pairwise feature comparisons (one subplot per
        reference/feature pair) via `plot_single_comparison`, then save the
        scatter figure (fig1) and the distance-mesh figure (fig2) to
        `self.output_path`.

        :param feature_list: feature column names, one subplot column each
        :param reference_list: reference feature names, one subplot row each
        """
        len_features = len(feature_list)
        len_ref_feats = len(reference_list)
        fig1, axes = plt.subplots(len_ref_feats, len_features)
        fig2, axes_meshes = plt.subplots(len_ref_feats, len_features, layout='compressed')
        for idx1, entry1 in enumerate(reference_list):
            for idx2, entry2 in enumerate(feature_list):
                # `plt.subplots` returns a bare Axes, a 1-D array, or a 2-D
                # array depending on the grid shape; handle all three.
                if isinstance(axes, Axes):
                    ax = axes
                    ax_cmesh = axes_meshes
                elif len_ref_feats == 1:
                    ax = axes[idx2]
                    ax_cmesh = axes_meshes[idx2]
                else:
                    ax = axes[idx1][idx2]
                    ax_cmesh = axes_meshes[idx1][idx2]
                # Only the very last subplot draws the shared colorbar.
                # NOTE(review): `&` is a bitwise and of two bools here; works,
                # but `and` would be the idiomatic choice.
                flag_plt_clbar = False
                if ((idx2 == (len(feature_list)-1)) & (idx1 == len(reference_list)-1)):
                    flag_plt_clbar = True
                # NOTE(review): return value is unused after the loop.
                colormesh = self.plot_single_comparison(self.tasks, entry1, entry2, ax, ax_cmesh, fig2, axes_meshes, flag_plt_clbar)
        # Build output file names from the numeric (objective) columns.
        objectives_keys = self.tasks.select_dtypes(exclude=['object']).columns
        objectives_keys = list(sorted(objectives_keys))
        abbreviations = get_keys_abbreviation(objectives_keys)
        fig1_title = f'Feature Comparison with {self.model_params[GENERATOR_PARAMS]}'
        fig1.suptitle(insert_newlines(fig1_title), fontsize=6)
        fig1.tight_layout()
        distance_plot_path = os.path.join(self.output_path,
                                          f"eval_genEL{len(self.gen)}_objectives{len(objectives_keys)}_trials{self.model_params['generator_params']['n_trials']}_{abbreviations}.png")
        os.makedirs(self.output_path, exist_ok=True)
        fig1.savefig(distance_plot_path)
        print(f"Saved objectives vs. genEL features plot in {distance_plot_path}")
        # fig2.suptitle('Meshgrid Comparison', fontsize=12)
        meshgrid_plot_path = os.path.join(self.output_path,
                                          f"meshgrid_genEL{len(self.gen)}_objectives{len(objectives_keys)}_trials{self.model_params['generator_params']['n_trials']}_{abbreviations}.png")
        fig2.savefig(meshgrid_plot_path)
        print(f"Saved meshgrid plot in {meshgrid_plot_path}")

    def plot_dist_mx(self, model_params):
        """
        Build and plot a distance matrix between generated and target feature
        CSVs.

        Reads generated feature CSVs from `self.input_path` and target CSVs
        from `model_params["targets"]`, keyed by their (feature1, feature2)
        column pair. For every pair present in both, averages the Euclidean
        distance between matching 'log' rows, writes the mean into a symmetric
        matrix, renders it as a heatmap, and finally plots a histogram of the
        individual distances that fell below a selection threshold.

        Side effects: saves "dist_mx_<abbrvs>" and "dist_histogram" figures to
        `self.output_path`, shows the heatmap, and triggers `select_instance`
        (event-log instance selection) for close matches.

        :param model_params: dict; "targets" maps to path prefixes of target CSVs
        """
        gen_dict = defaultdict(lambda: defaultdict(dict))
        targets_dict = defaultdict(lambda: defaultdict(dict))
        set_ = set()
        # ---- index generated CSVs by their feature-column pair ----
        for in_file in self.input_path:
            for file in glob.glob(f'{in_file}*.csv'):
                read_in = pd.read_csv(file)
                feat1, feat2 = None, None
                if len(read_in.columns) == 2:
                    # Single-objective file: pair the feature with itself.
                    feat1 = read_in.columns[0]
                    feat2 = feat1
                else:
                    feat1 = read_in.columns[0]
                    feat2 = read_in.columns[1]
                read_in['fn'] = file  # remember source path for instance selection
                gen_dict[feat1][feat2] = read_in
                set_.add(feat1)
                set_.add(feat2)
        # ---- index target CSVs the same way (columns shifted by one,
        # since column 0 is the task/log identifier) ----
        for target_file in model_params["targets"]:
            for file in glob.glob(f'{target_file}*.csv'):
                read_in = pd.read_csv(file)
                if 'task' in read_in.columns:
                    read_in.rename(columns={"task":"log"}, inplace=True)
                feat1, feat2 = None, None
                if len(read_in.columns) == 2:
                    feat1 = read_in.columns[1]
                    feat2 = feat1
                else:
                    feat1 = read_in.columns[1]
                    feat2 = read_in.columns[2]
                read_in['fn'] = file
                targets_dict[feat1][feat2] = read_in
                set_.add(feat1)
                set_.add(feat2)
        keys = sorted(list(set_))
        result_df = pd.DataFrame(index=keys, columns=keys)
        dist_list = list()
        # ---- mean distance per feature pair present in both indexes ----
        # NOTE(review): `gen_idx` from enumerate is unused.
        for gen_idx, (gen_obj1_key, gen_obj1_vals) in enumerate(gen_dict.items()):
            if gen_obj1_key not in targets_dict:
                continue
            for gen_obj1_value in gen_obj1_vals:
                if gen_obj1_value not in targets_dict[gen_obj1_key]:
                    continue
                gen_df = gen_dict[gen_obj1_key][gen_obj1_value]
                target_df = targets_dict[gen_obj1_key][gen_obj1_value]
                cnt = 0
                cum_sum = 0
                for i in gen_df.index:
                    current_log_name = gen_df.loc[i, 'log']
                    if current_log_name in target_df['log'].values:
                        target_entry = target_df[target_df['log'] == current_log_name]
                    else:
                        # NOTE(review): on a miss `target_entry` keeps its value
                        # from the previous iteration (or is unbound on the very
                        # first one) -- confirm this fallback is intended.
                        print(f"[INFO] no value found for {current_log_name} in target file")
                    vec1 = np.array([gen_df[gen_obj1_key][i], gen_df[gen_obj1_value][i]])
                    vec2 = np.array([target_entry[gen_obj1_key].values[0], target_entry[gen_obj1_value].values[0]])
                    cum_sum += np.linalg.norm(vec1 - vec2)
                    cnt += 1
                    THRESHOLD=0.1
                    # Close match in a multi-objective file: derive the event-log
                    # path from the CSV file name and run instance selection.
                    if np.linalg.norm(vec1 - vec2) < THRESHOLD and len(gen_df.columns)>3:#3 for 1 objective
                        path_splits = gen_df.loc[i, 'fn'].split("/")
                        data_splits = path_splits[-1][:-4].split("_")
                        log_path= f'grid_2objectives_{data_splits[1]}_{data_splits[2]}/2_{data_splits[1]}_{data_splits[2]}/genEL{current_log_name}_*.xes'
                        dest, len_is = select_instance(in_file.replace("features/", ""), log_path)
                        dist_list.append(np.linalg.norm(vec1 - vec2))
                # Mean distance for this feature pair; stored symmetrically.
                cum_sum /= cnt
                result_df.loc[gen_obj1_key, gen_obj1_value] = cum_sum
                result_df.loc[gen_obj1_value, gen_obj1_key] = cum_sum
        # `dest`/`len_is` only exist if at least one close match occurred above.
        try:
            print(f"INFO: Instance selection saved {len_is} ED selection in {dest}")
        except UnboundLocalError as e:
            print(e)
        # Hard-coded diagonal values for these three ratio features
        # (presumably precomputed baselines -- TODO confirm their origin).
        ratio_most_common_variant = 2.021278 / 11.0
        ratio_top_10_variants = 0.07378 / 11.0
        ratio_variants_per_number_of_traces = 0.016658 / 11.0
        # NOTE(review): chained indexing (df[col][row] = ...) is deprecated
        # pandas chained assignment; .loc[row, col] would be the safe form.
        result_df['ratio_most_common_variant']['ratio_most_common_variant'] = ratio_most_common_variant
        result_df['ratio_top_10_variants']['ratio_top_10_variants'] = ratio_top_10_variants
        result_df['ratio_variants_per_number_of_traces']['ratio_variants_per_number_of_traces'] = ratio_variants_per_number_of_traces
        # Relabel rows/columns with abbreviated feature names.
        abbrvs_key = get_keys_abbreviation(keys)
        result_df.columns = abbrvs_key.split("_")
        result_df.index = abbrvs_key.split("_")
        # result__mx = result_df.values.astype(np.float16)
        # result__mx[np.isnan(result__mx)] = 0
        # NOTE(review): `img` is unused.
        img = sns.heatmap(result_df.astype(np.float16),annot=True, cmap="viridis_r", vmin=0.0, vmax=1.0)
        # plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_path, f"dist_mx_{abbrvs_key}"))
        plt.show()
        # Histogram of the individual below-threshold distances collected above.
        fig = plt.figure()
        sns.histplot(data=pd.DataFrame(dist_list), x=0, bins=30)
        fig.savefig(os.path.join(self.output_path, f"dist_histogram"))