# gedi/plotter.py
# Author: Andrea Maldonado
# Last change: Deactivates Evaluation plotter (commit e1801b6)
import matplotlib as mpl
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import os
import glob
from collections import defaultdict
from gedi.generator import get_tasks
from gedi.utils.io_helpers import get_keys_abbreviation
from gedi.utils.io_helpers import read_csvs, select_instance
from gedi.utils.param_keys import PLOT_TYPE, PROJECTION, EXPLAINED_VAR, PLOT_3D_MAP
from gedi.utils.param_keys import OUTPUT_PATH, PIPELINE_STEP
from gedi.utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, PLOT_REFERENCE_FEATURE
from gedi.utils.param_keys.plotter import REAL_EVENTLOG_PATH, FONT_SIZE, BOXPLOT_WIDTH
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.lines import Line2D
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.decomposition import PCA
def insert_newlines(string, every=140):
    """Break *string* into chunks of at most *every* characters, joined by newlines."""
    chunks = (string[start:start + every] for start in range(0, len(string), every))
    return '\n'.join(chunks)
class MyPlotter:
    """Base plotter holding a figure/axes pair and shared display settings."""

    def __init__(self, interactive: bool = True, title_prefix: str = '', for_paper: bool = False):
        # Placeholder figure/axes; subclasses overwrite these via plt.subplots().
        self.fig: Figure = Figure()
        self.axes: Axes = Axes(self.fig, [0, 0, 0, 0])
        self.interactive: bool = interactive
        self.title_prefix: str = title_prefix
        self.colors: dict = mcolors.TABLEAU_COLORS
        self.for_paper: bool = for_paper
        if self.interactive:
            # Switch to an interactive backend for on-screen display.
            mpl.use('TkAgg')
        # Paper-ready figures get a larger font; screen figures a compact one.
        self.fontsize = 18 if self.for_paper else 10

    def _set_figure_title(self):
        """Apply the configured title prefix as the figure's suptitle."""
        self.fig.suptitle(self.title_prefix)

    def _post_processing(self):
        """Finalize the figure: set the title (unless in paper mode) and show it."""
        if not self.for_paper:
            self._set_figure_title()
        plt.show()
class ModelResultPlotter(MyPlotter):
    """Plots model projections (PCA/TSNE) side by side, optionally with component tics."""

    def plot_models(self, model_results, plot_type='', plot_tics=False, components=None):
        """
        Plots the model results in a 2d-coordinate system next to each other.
        Alternatively the tics of the components can be plotted under the figures
        when `plot_tics` is True.
        :param model_results: list of dictionary
            dict should contain the keys: 'model', 'projection', 'title_prefix' (optional)
        :param plot_type: param_key.plot_type
        :param plot_tics: bool (default: False)
            Plots the component tics under the base figures if True
        :param components: int
            Number of components used for the reduction
        """
        if plot_tics:
            # One row of scatter plots on top, plus one row per component tic plot.
            self.fig, self.axes = plt.subplots(components + 1, len(model_results),
                                               constrained_layout=True, figsize=(10, 8))
            if len(model_results) == 1:
                # BUGFIX: was `model_results[DUMMY_ZERO]` with DUMMY_ZERO never
                # defined or imported (NameError); index 0 is the only result here.
                for component_nr in range(1, components + 1):
                    self._plot_time_tics(self.axes[component_nr], model_results[0][PROJECTION],
                                         component=component_nr)
            else:
                for i, result in enumerate(model_results):
                    df_pca = pd.DataFrame(result[PROJECTION], columns=["PC1", "PC2"])
                    sns.scatterplot(ax=self.axes[0][i], data=df_pca, x="PC1", y="PC2",
                                    palette="bright", hue=[''] * len(df_pca), alpha=0.9, s=100)
                    try:
                        self.axes[0][i].set_xlabel(
                            f"PC1 ({np.round(result[EXPLAINED_VAR][0] * 100, 2)}% explained variance)")
                        self.axes[0][i].set_ylabel(
                            f"PC2 ({np.round(result[EXPLAINED_VAR][1] * 100, 2)}% explained variance)")
                    except TypeError:
                        # No explained-variance info available (e.g. a TSNE projection).
                        self.axes[0][i].set_xlabel("TSNE_1")
                        self.axes[0][i].set_ylabel("TSNE_2")
                    for component_nr in range(1, components + 1):
                        self._plot_time_tics(self.axes[component_nr][i], result[PROJECTION],
                                             component=component_nr)
        else:
            self.fig, self.axes = plt.subplots(1, len(model_results), constrained_layout=True)
        plt.show()

    @staticmethod
    def _plot_time_tics(ax, projection, component):
        """
        Plot one component of the projection over time on a specific axis.
        :param ax: axis to draw on
        :param projection: ndarray with one column per component
        :param component: 1-based component index
        """
        ax.cla()
        ax.set_xlabel('Time step')
        ax.set_ylabel('Component {}'.format(component))
        ax.label_outer()
        ax.plot(projection[:, component - 1])
class ArrayPlotter(MyPlotter):
    """Plotter for array-shaped data: matrices, gauss fits and merged 1d series."""
    def __init__(self, interactive=False, title_prefix='', x_label='', y_label='', bottom_text=None, y_range=None,
                 show_grid=False, xtick_start=0, for_paper=False):
        """Extend MyPlotter with axis labels, an optional caption, y-range and grid settings."""
        super().__init__(interactive, title_prefix, for_paper)
        self.x_label = x_label
        self.y_label = y_label
        # Optional caption rendered at the bottom-left of the figure.
        self.bottom_text = bottom_text
        # Optional (ymin, ymax) tuple applied via set_ylim in _post_processing.
        self.range_tuple = y_range
        # Plots that attach line labels flip this to True to enable the legend.
        self._activate_legend = False
        self.show_grid = show_grid
        # First x-tick label value used by matrix_plot.
        self.xtick_start = xtick_start
    def _post_processing(self, legend_outside=False):
        """Apply labels, caption, legend, y-range and grid, then delegate to MyPlotter.
        :param legend_outside: bool
            Place the legend below the axes if True.
        """
        # self.axes.set_title(self.title_prefix)
        self.axes.set_xlabel(self.x_label, fontsize=self.fontsize)
        self.axes.set_ylabel(self.y_label, fontsize=self.fontsize)
        # plt.xticks(fontsize=self.fontsize)
        # plt.yticks(fontsize=self.fontsize)
        if self.bottom_text is not None:
            self.fig.text(0.01, 0.01, self.bottom_text, fontsize=self.fontsize)
            self.fig.tight_layout()
            # Reserve one stripe of bottom margin per caption line.
            self.fig.subplots_adjust(bottom=(self.bottom_text.count('\n') + 1) * 0.1)
        else:
            self.fig.tight_layout()
        if legend_outside:
            self.axes.legend(bbox_to_anchor=(0.5, -0.05), loc='upper center', fontsize=8)
            plt.subplots_adjust(bottom=0.25)
        elif self._activate_legend:
            self.axes.legend(fontsize=self.fontsize)
        if self.range_tuple is not None:
            self.axes.set_ylim(self.range_tuple)
        if self.show_grid:
            plt.grid(True, which='both')
            plt.minorticks_on()
        super()._post_processing()
    def matrix_plot(self, matrix, as_surface='2d', show_values=False):
        """
        Plots the values of a matrix on a 2d or a 3d axes
        :param matrix: ndarray (2-ndim)
            matrix, which should be plotted
        :param as_surface: str
            Plot as a 3d-surface if value PLOT_3D_MAP else 2d-axes
        :param show_values: If true, then show the values in the matrix
        """
        c_map = plt.cm.viridis
        # c_map = plt.cm.seismic
        if as_surface == PLOT_3D_MAP:
            x_coordinates = np.arange(matrix.shape[0])
            y_coordinates = np.arange(matrix.shape[1])
            x_coordinates, y_coordinates = np.meshgrid(x_coordinates, y_coordinates)
            self.fig = plt.figure()
            # NOTE(review): Figure.gca(projection='3d') was removed in matplotlib >= 3.6;
            # this presumably relies on an older pinned matplotlib — confirm.
            self.axes = self.fig.gca(projection='3d')
            self.axes.set_zlabel('Covariance Values', fontsize=self.fontsize)
            im = self.axes.plot_surface(x_coordinates, y_coordinates, matrix, cmap=c_map)
        else:
            self.fig, self.axes = plt.subplots(1, 1, dpi=80)
            im = self.axes.matshow(matrix, cmap=c_map)
            if show_values:
                # Overlay each cell with its numeric value.
                for (i, j), value in np.ndenumerate(matrix):
                    self.axes.text(j, i, '{:0.2f}'.format(value), ha='center', va='center', fontsize=8)
        if not self.for_paper:
            self.fig.colorbar(im, ax=self.axes)
        plt.xticks(np.arange(matrix.shape[1]), np.arange(self.xtick_start, matrix.shape[1] + self.xtick_start))
        # plt.xticks(np.arange(matrix.shape[1], step=5),
        #            np.arange(self.xtick_start, matrix.shape[1] + self.xtick_start, step=5))
        self._post_processing()
    def plot_gauss2d(self,
                     x_index: np.ndarray,
                     ydata: np.ndarray,
                     new_ydata: np.ndarray,
                     gauss_fitted: np.ndarray,
                     fit_method: str,
                     statistical_function: callable = np.median):
        """
        Plot the original data (ydata), the new data (new_ydata) where the x-axis-indices is given by (x_index),
        the (fitted) gauss curve and a line (mean, median)
        :param x_index: ndarray (1-ndim)
            range of plotting
        :param ydata: ndarray (1-ndim)
            original data
        :param new_ydata: ndarray (1-ndim)
            the changed new data
        :param gauss_fitted: ndarray (1-ndim)
            the fitted curve on the new data
        :param fit_method: str
            the name of the fitting method
        :param statistical_function: callable
            Some statistical numpy function
        :return:
        """
        self.fig, self.axes = plt.subplots(1, 1, dpi=80)
        self.axes.plot(x_index, gauss_fitted, '-', label=f'fit {fit_method}')
        # self.axes.plot(x_index, gauss_fitted, ' ')
        self.axes.plot(x_index, ydata, '.', label='original data')
        # self.axes.plot(x_index, ydata, ' ')
        statistical_value = np.full(x_index.shape, statistical_function(ydata))
        if self.for_paper:
            function_label = 'threshold'
        else:
            # NOTE(review): `function_name` is not defined or imported in this file —
            # presumably a helper from another gedi module; verify it is in scope.
            function_label = function_name(statistical_function)
        self._activate_legend = True
        self.axes.plot(x_index, statistical_value, '-', label=function_label)
        # self.axes.plot(x_index, statistical_value, ' ')
        # self.axes.plot(x_index, new_ydata, '.', label='re-scaled data')
        # Plotted invisibly (' ' marker) so the axes limits still cover new_ydata.
        self.axes.plot(x_index, new_ydata, ' ')
        self._post_processing()
    def plot_2d(self, ndarray_data, statistical_func=None):
        """Plot a 1d series; optionally overlay a horizontal statistical-value line."""
        self.fig, self.axes = plt.subplots(1, 1)
        self.axes.plot(ndarray_data, '-')
        if statistical_func is not None:
            statistical_value = statistical_func(ndarray_data)
            statistical_value_line = np.full(ndarray_data.shape, statistical_value)
            self.axes.plot(statistical_value_line, '-',
                           label=f'{function_name(statistical_func)}: {statistical_value:.4f}')
            self._activate_legend = False
        self._post_processing()
    def plot_merged_2ds(self, ndarray_dict: dict, statistical_func=None):
        """Plot several 1d series in one axes, color-matched with optional per-series
        statistical-value lines (dashed) labeled by dict key."""
        self.fig, self.axes = plt.subplots(1, 1, dpi=80)
        self.title_prefix += f'with {function_name(statistical_func)}' if statistical_func is not None else ''
        for key, ndarray_data in ndarray_dict.items():
            # noinspection PyProtectedMember
            # NOTE(review): _get_lines.prop_cycler is a private matplotlib API removed
            # in matplotlib >= 3.8 — confirm the pinned version supports it.
            color = next(self.axes._get_lines.prop_cycler)['color']
            if statistical_func is not None:
                if isinstance(ndarray_data, list):
                    ndarray_data = np.asarray(ndarray_data)
                self.axes.plot(ndarray_data, '-', color=color)
                statistical_value = statistical_func(ndarray_data)
                statistical_value_line = np.full(ndarray_data.shape, statistical_value)
                self.axes.plot(statistical_value_line, '--',
                               label=f'{key.strip()}: {statistical_value:.4f}', color=color)
            else:
                self.axes.plot(ndarray_data, '-', color=color, label=f'{key.strip()[:35]}')
        self._activate_legend = True
        self._post_processing()
class BenchmarkPlotter:
    """Creates and saves correlation heatmaps for miner benchmark results."""

    def __init__(self, benchmark_results, output_path=None):
        self.plot_miners_correlation(benchmark_results, output_path=output_path)
        self.plot_miner_feat_correlation(benchmark_results, output_path=output_path)
        self.plot_miner_feat_correlation(benchmark_results, mean='methods', output_path=output_path)

    def plot_miner_feat_correlation(self, benchmark, mean='metrics', output_path=None):
        """Plot feature/performance correlations, averaged per miner method or per metric.
        :param benchmark: pd.DataFrame with a 'log' column plus feature and miner-result columns
        :param mean: 'methods' to average per mining algorithm, 'metrics' per quality metric
        :param output_path: directory to save the .jpg into (no file written when None)
        """
        df = benchmark.loc[:, benchmark.columns != 'log']
        corr = df.corr()
        if mean == 'methods':
            for method in ['inductive', 'heu', 'ilp']:
                method_cols = [col for col in corr.columns if col.startswith(method)]
                corr[method + '_avg'] = corr.loc[:, corr.columns.isin(method_cols)].mean(axis=1)
        elif mean == 'metrics':
            for metric in ['fitness', 'precision', 'generalization', 'simplicity']:
                metric_cols = [col for col in corr.columns if col.endswith(metric)]
                corr[metric + '_avg'] = corr.loc[:, corr.columns.isin(metric_cols)].mean(axis=1)
        avg_cols = [col for col in corr.columns if col.endswith('_avg')]
        benchmark_result_cols = [col for col in corr.columns if col.startswith('inductive')
                                 or col.startswith('heu') or col.startswith('ilp')]
        # Keep only the feature rows; the miner-result rows are redundant in this view.
        corr = corr[:][~corr.index.isin(benchmark_result_cols)]
        fig, axes = plt.subplots(1, len(avg_cols), figsize=(15, 10))
        for i, ax in enumerate(axes):
            # Draw the shared colorbar only once, on the fourth subplot (metrics mode).
            cbar = i == 3
            corr = corr.sort_values(avg_cols[i], axis=0, ascending=False)
            sns.heatmap(corr[[avg_cols[i]]][:],
                        ax=ax,
                        xticklabels=[avg_cols[i]],
                        yticklabels=corr.index,
                        cbar=cbar)
        plt.subplots_adjust(wspace=1, top=0.9, left=0.15)
        fig.suptitle(f"Feature and performance correlation per {mean.split('s')[0]} for {len(benchmark)} event-logs")
        if output_path is not None:
            output_path = output_path + f"/minperf_corr_{mean.split('s')[0]}_el{len(benchmark)}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved correlation plot at {output_path}")

    def plot_miners_correlation(self, benchmark, output_path=None):
        """Plot the correlation heatmap between all miner performance columns.
        :param benchmark: pd.DataFrame with a 'log' column plus miner-result columns
        :param output_path: directory to save the .jpg into (no file written when None)
        """
        benchmark_result_cols = [col for col in benchmark.columns if col.startswith('inductive')
                                 or col.startswith('heu') or col.startswith('ilp')]
        df = benchmark.loc[:, benchmark.columns != 'log']
        df = df.loc[:, df.columns.isin(benchmark_result_cols)]
        corr = df.corr()
        fig, ax = plt.subplots(figsize=(15, 10))
        sns.heatmap(corr,
                    ax=ax,
                    xticklabels=corr.columns.values,
                    yticklabels=corr.columns.values)
        plt.title(f"Miners and performance correlation for {len(benchmark)} event-logs", loc='center')
        if output_path is not None:
            output_path = output_path + f"/minperf_corr_el{len(benchmark)}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved correlation plot at {output_path}")
class FeaturesPlotter:
    """Plots feature distributions (violin/box plots) of generated event logs,
    optionally compared against real event logs."""

    def __init__(self, features, params=None):
        """Dispatch to the single- or multi-source distribution plot and save the result.
        :param features: pd.DataFrame of generated event-log features
        :param params: dict of pipeline parameters (see gedi.utils.param_keys.plotter)
        """
        output_path = params[OUTPUT_PATH] if OUTPUT_PATH in params else None
        # Build optional keyword arguments directly instead of eval()-ing a
        # string-built call (former behaviour; fragile and injection-prone).
        # Values are coerced to str to match the old behaviour, where they were
        # interpolated into the call string as quoted literals.
        kwargs = {}
        if params.get(PLOT_TYPE):
            kwargs['plot_type'] = str(params[PLOT_TYPE])
        if params.get(FONT_SIZE):
            kwargs['font_size'] = str(params[FONT_SIZE])
        if params.get(BOXPLOT_WIDTH):
            kwargs['boxplot_w'] = str(params[BOXPLOT_WIDTH])
        source_name = os.path.split(params['input_path'])[-1].replace(".csv", "") + "_"
        if REAL_EVENTLOG_PATH in params:
            real_eventlogs = pd.read_csv(params[REAL_EVENTLOG_PATH])
            if params.get(PIPELINE_STEP):
                kwargs['legend'] = True
            fig, output_path = self.plot_violinplot_multi(features, output_path, real_eventlogs,
                                                          source=source_name, **kwargs)
        else:
            fig, output_path = self.plot_violinplot_single(features, output_path,
                                                           source=source_name, **kwargs)
        if output_path is not None:
            os.makedirs(os.path.split(output_path)[0], exist_ok=True)
            fig.savefig(output_path)
            # Message fixed: previously printed the raw ", plot_type='...'" kwarg fragment.
            print(f"SUCCESS: Saved {kwargs.get('plot_type', 'violinplot')} plot in {output_path}")

    def plot_violinplot_single(self, features, output_path=None, source="_", plot_type="violinplot",
                               font_size=16, boxplot_w=16):
        """Plot one distribution subplot per numeric feature column of the generated logs.
        :return: (figure, target .jpg path built from output_path)
        """
        columns = features.columns[1:]
        df1 = features.select_dtypes(exclude=['object'])
        fig, axes = plt.subplots(len(df1.columns), 1, figsize=(int(boxplot_w), len(df1.columns)))
        plot_fnc = getattr(sns, plot_type)  # e.g. sns.violinplot / sns.boxplot
        for i, ax in enumerate(axes):
            plot_fnc(data=df1, x=df1[df1.columns[i]], ax=ax)
        fig.suptitle(f"{len(columns)} features distribution for {len(features)} generated event-logs",
                     fontsize=font_size, y=1)
        fig.tight_layout()
        output_path = output_path + f"/{plot_type}s_{source}{len(columns)}fts_{len(df1)}gEL.jpg"
        return fig, output_path

    def plot_violinplot_multi(self, features, output_path, real_eventlogs, source="_", plot_type="violinplot",
                              font_size=24, legend=False, boxplot_w=16):
        """Plot distributions of generated vs. real event logs, feature by feature.
        :return: (figure, target .jpg path built from output_path)
        """
        LOG_NATURE = "Log Nature"
        GENERATED = "Generated"
        REAL = "Real"
        FONT_SIZE = font_size
        alpha = 0.7
        color = sns.color_palette("bright")
        markers = ['o', 'X']
        features[LOG_NATURE] = GENERATED
        real_eventlogs[LOG_NATURE] = REAL
        bdf = pd.concat([features, real_eventlogs])
        bdf = bdf[features.columns]
        bdf = bdf.dropna(axis='rows')
        columns = bdf.columns[3:]
        dmf1 = bdf.select_dtypes(exclude=['object'])
        # Violin plots get their inner annotation suppressed; as before, the same
        # extra kwargs are forwarded to the strip plot call too.
        extra_kwargs = {'inner': None} if plot_type == 'violinplot' else {}
        plot_fnc = getattr(sns, plot_type)
        fig, axes = plt.subplots(len(dmf1.columns), 1, figsize=(int(boxplot_w), len(dmf1.columns) * 1.25), dpi=300)
        if isinstance(axes, Axes):  # a single numeric feature yields one bare Axes
            axes = [axes]
        nature_types = list(reversed(bdf['Log Nature'].unique()[:2]))
        for i, ax in enumerate(axes):
            for j, nature in enumerate(nature_types):
                plot_fnc(data=bdf[bdf['Log Nature'] == nature], x=dmf1.columns[i],
                         palette=[color[j]], ax=ax, **extra_kwargs)
                sns.stripplot(data=bdf[bdf['Log Nature'] == nature], x=dmf1.columns[i],
                              palette=[color[j]], marker=markers[j], ax=ax, **extra_kwargs)
            for collection in ax.collections:
                collection.set_alpha(alpha)
            for patch in ax.patches:
                r, g, b, a = patch.get_facecolor()
                patch.set_facecolor((r, g, b, alpha))
            custom_lines = [
                Line2D([0], [0], color=color[nature], lw=4, alpha=alpha)
                for nature in [0, 1, 2]
            ]
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.tick_params(axis='both', which='minor', labelsize=FONT_SIZE)
            ax.set_xlabel(dmf1.columns[i], fontsize=FONT_SIZE)
        if legend:
            fig.legend(custom_lines, nature_types, loc='upper right', ncol=len(nature_types), prop={'size': FONT_SIZE})
            plt.legend(fontsize=FONT_SIZE)
        plt.yticks(fontsize=FONT_SIZE)
        plt.xticks(fontsize=FONT_SIZE)
        fig.tight_layout()
        output_path = output_path + f"/{plot_type}s_{source}{len(columns)}fts_{len(features)}gEL_of{len(bdf[bdf['Log Nature'].isin(nature_types)])}.jpg"
        return fig, output_path
class AugmentationPlotter(object):
    """Plotter for the augmented features.
    If just 2 features are examined, the plotter outputs a scatterplot with the two features defining
    the dimensions.
    If more than 2 features are examined, a PCA is performed first before the first two principal
    components are plotted.
    Parameters
    ----------
    features : pd.DataFrame
        dataFrame containing the information of the real and synthesized datasets.
    """
    def __init__(self, features, params=None) -> None:
        output_path = params[OUTPUT_PATH] if OUTPUT_PATH in params else None
        self.sampler = params['augmentation_params']['method']
        # Direct method call; previously wrapped in a pointless eval() on a
        # constant string, which only obscured the call.
        self.plot_augmented_features(features, output_path)
    def plot_augmented_features(self, features, output_path=None) -> None:
        """Plotting for augmented features. When more than 2 features are selected, the
        plot will show the result after applying a PCA; otherwise the 2 features are
        plotted according to the values.
        Parameters
        ----------
        features : pd.DataFrame
            DataFrame containing the augmented features
        output_path : str, optional
            Path to the output file, by default None
        Raises
        ------
        AssertionError
            If fewer than 2 (augmented) features are available for plotting.
        """
        if len(features.all.columns) < 2:
            # Message fixed: the condition rejects *fewer* than 2 features.
            raise AssertionError("AugmentationPlotter - At least 2 (augmented) features are expected for plotting.")
        if len(features.all.columns) > 2:
            self._plot_pca(features, output_path)
        else:
            self._plot_2d(features, output_path)
    def _plot_2d(self, features, output_path=None) -> None:
        """Fnc for plotting 2D features without any dimension reduction technique being applied.
        Parameters
        ----------
        features : pd.DataFrame
            Dataframe containing the augmented features
        output_path : str, optional
            Path to the output file, by default None
        """
        col1_name, col2_name = features.all.columns
        # INIT - settings: X holds the real samples only, X_aug real + synthesized.
        X = features.all.iloc[:-features.new_samples.shape[0]]
        X = X.to_numpy()
        X_aug = features.all.to_numpy()
        sns.set_theme()
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
        fig.suptitle(f'Log Descriptors - real: {X.shape[0]}, synth.: {X_aug.shape[0]-X.shape[0]}', fontsize=16)
        # Normalizer: applied to each observation -> row values have unit norm
        normalizer = Normalizer(norm="l2").fit(X)
        normed_data = normalizer.transform(X_aug)
        # StandardScaler: applied to features -> col values have unit norm
        scaler = StandardScaler().fit(X)
        scaled_data = scaler.transform(X_aug)
        # PLOT - raw 2d data
        X_aug = self._add_real_synth_encoding(X_aug, X, X_aug)
        df_raw = self._convert_to_df(X_aug, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax1, data=df_raw, x=col1_name, y=col2_name, palette="bright",
                        hue="type", alpha=0.5, s=100).set_title("Raw data")
        ax1.get_legend().set_title("")
        # PLOT - normed 2d data
        normed_data = self._add_real_synth_encoding(normed_data, X, X_aug)
        df_normed = self._convert_to_df(normed_data, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax2, data=df_normed, x=col1_name, y=col2_name, palette="bright",
                        hue='type', alpha=0.5, s=100).set_title("Normalized data")
        ax2.get_legend().set_title("")
        # PLOT - scaled 2d data
        scaled_data = self._add_real_synth_encoding(scaled_data, X, X_aug)
        df_scaled = self._convert_to_df(scaled_data, [col1_name, col2_name, 'type'])
        sns.scatterplot(ax=ax3, data=df_scaled, x=col1_name, y=col2_name, palette="bright",
                        hue='type', alpha=0.5, s=100).set_title("Scaled data")
        ax3.get_legend().set_title("")
        plt.tight_layout()
        # OUTPUT
        if output_path is not None:
            output_path += f"/augmentation_2d_plot_{col1_name}-{col2_name}_{self.sampler}.jpg"
            fig.savefig(output_path)
            # Message fixed: this is the 2d plot, not the pca plot.
            print(f"SUCCESS: Saved augmentation 2d plot at {output_path}")
    def _add_real_synth_encoding(self, arr, X, X_aug) -> np.ndarray:
        """Helper function for adding one additional column to the array in the last column.
        The last column indicates whether it is a real data (=0) or synthesized (=1).
        Parameters
        ----------
        arr : np.ndarray
            data array
        X : np.ndarray
            data of real datasets
        X_aug : np.ndarray
            data of real datasets and synthesized datasets
        Returns
        -------
        np.ndarray
            array containing the data with an additional last column indicating whether the
            data comes from a real dataset or synthesized one
        """
        real_synth_enc = np.array([0]*X.shape[0] + [1]*(X_aug.shape[0]-X.shape[0])).reshape(-1, 1)
        return np.hstack([arr, real_synth_enc])
    def _convert_to_df(self, arr, colnames, enc=['real', 'synth']) -> pd.DataFrame:
        """Converts the attached array to a dataframe. The column names are
        defined by the respective parameters, where the last column is encoded
        by the string array of the enc parameter.
        Parameters
        ----------
        arr : np.ndarray
            data array whose last column holds the 0/1 real-vs-synth encoding
        colnames : list
            column names of returned dataframe
        enc : list, optional
            labels for real vs. generated data, by default ['real', 'synth']
        Returns
        -------
        pd.DataFrame
            dataframe containing the attached data array with encoded values in the last column
        """
        df = pd.DataFrame(arr, columns=colnames)
        df.loc[df.iloc[:, -1] == 0, colnames[-1]] = enc[0]
        df.loc[df.iloc[:, -1] == 1, colnames[-1]] = enc[1]
        return df
    def _plot_pca(self, features, output_path=None) -> None:
        """Fnc for plotting features with PCA as dimension reduction technique being applied.
        Parameters
        ----------
        features : pd.DataFrame
            DataFrame containing the augmented features
        output_path : str, optional
            path to output file, by default None
        """
        # INIT - settings: X holds the real samples only, X_aug real + synthesized.
        n_features = features.all.shape[1]
        X = features.all.iloc[:-features.new_samples.shape[0]]
        X = X.to_numpy()
        X_aug = features.all.to_numpy()
        sns.set_theme()
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
        fig.suptitle(f'Log Descriptors - real: {X.shape[0]}, synth.: {X_aug.shape[0]-X.shape[0]}', fontsize=16)
        pca_components = 2
        pca = PCA(n_components=pca_components)
        # Normalizer: applied to each observation -> row values have unit norm
        normalizer = Normalizer(norm="l2").fit(X)
        normed_data_real = normalizer.transform(X)
        normed_data_aug = normalizer.transform(X_aug)
        # StandardScaler: applied to features -> col values have unit norm
        scaler = StandardScaler().fit(X)
        scaled_data_real = scaler.transform(X)
        scaled_data_aug = scaler.transform(X_aug)
        # PLOT - PCA on raw input (fit on real data only, transform real + synth)
        fit_pca = pca.fit(X)
        X_new = fit_pca.transform(X_aug)
        X_new = self._add_real_synth_encoding(X_new[:, :pca_components], X, X_aug)
        df_pca = self._convert_to_df(X_new, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax1, data=df_pca, x="PC_1", y="PC_2", palette="bright", hue='type', alpha=0.5, s=100)
        ax1.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax1.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax1.get_legend().set_title("")
        # PLOT - PCA on normed data
        fit_norm_pca = pca.fit(normed_data_real)
        X_new_normed = fit_norm_pca.transform(normed_data_aug)
        X_new_normed = self._add_real_synth_encoding(X_new_normed[:, :pca_components], X, X_aug)
        df_pca_normed = self._convert_to_df(X_new_normed, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax2, data=df_pca_normed, x="PC_1", y="PC_2", palette="bright", hue='type', alpha=0.5, s=100)
        ax2.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax2.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax2.get_legend().set_title("")
        # PLOT - PCA on scaled data
        fit_sca_pca = pca.fit(scaled_data_real)
        X_new_sca = fit_sca_pca.transform(scaled_data_aug)
        X_new_sca = self._add_real_synth_encoding(X_new_sca[:, :pca_components], X, X_aug)
        df_pca_scaled = self._convert_to_df(X_new_sca, ['PC_1', 'PC_2', 'type'])
        sns.scatterplot(ax=ax3, data=df_pca_scaled, x="PC_1", y="PC_2", palette="bright", hue='type', alpha=0.5, s=100)
        ax3.set_xlabel(f"PC1 ({np.round(pca.explained_variance_ratio_[0]*100, 2)}% explained variance)")
        ax3.set_ylabel(f"PC2 ({np.round(pca.explained_variance_ratio_[1]*100, 2)}% explained variance)")
        ax3.get_legend().set_title("")
        plt.tight_layout()
        # OUTPUT
        if output_path is not None:
            output_path += f"/augmentation_pca_{n_features}_{self.sampler}.jpg"
            fig.savefig(output_path)
            print(f"SUCCESS: Saved augmentation pca plot at {output_path}")
class GenerationPlotter(object):
    def __init__(self, gen_cfg, model_params, output_path, input_path=None):
        """Drive all generation-related plots.
        :param gen_cfg: pd.DataFrame of generated event-log results; may contain a
            'metafeatures' column of per-log feature dicts
        :param model_params: dict of pipeline parameters (see gedi.utils.param_keys)
        :param output_path: directory where plots are written
        :param input_path: optional .csv path, list of paths, or directory used for
            reference-feature comparison plots
        """
        print(f"Running plotter for {len(gen_cfg)} genEL, params {model_params}, output path: {output_path}")
        self.output_path = output_path
        self.input_path = input_path
        self.model_params = model_params
        if gen_cfg.empty:  # Deactivated for tests
            return
        if "metafeatures" in gen_cfg.columns:
            # Expand the per-log metafeature dicts into one DataFrame row per log.
            self.gen = gen_cfg.metafeatures
            self.gen = pd.concat([pd.DataFrame.from_dict(entry, orient="Index").T for entry in self.gen]).reset_index(drop=True)
        else:
            self.gen = gen_cfg.reset_index(drop=True)
        if GENERATOR_PARAMS in model_params:
            self.tasks, _ = get_tasks(model_params[GENERATOR_PARAMS][EXPERIMENT])
            # Numeric task columns are the candidate features to compare.
            feature_list = list(self.tasks.select_dtypes(exclude=['object']).keys())
            ref_feat = None
            if PLOT_REFERENCE_FEATURE in model_params[GENERATOR_PARAMS] and model_params[GENERATOR_PARAMS][PLOT_REFERENCE_FEATURE] != "":
                ref_feat = model_params[GENERATOR_PARAMS][PLOT_REFERENCE_FEATURE]
            # With a configured reference feature, compare only against it.
            reference_feature_list = feature_list if ref_feat is None else [ref_feat]
        self.plot_settings()
        if input_path is not None:
            # plot single reference feature compared to values stored in .csvs
            if isinstance(input_path, str) and input_path.endswith(".csv"):
                f_d = pd.read_csv(input_path)
                f_d = {model_params['reference_feature']: f_d}
            elif isinstance(input_path, list):
                self.plot_dist_mx(model_params)
            else:
                f_d = read_csvs(input_path, model_params['reference_feature'])
            # NOTE(review): when input_path is a list, f_d is never assigned yet is
            # used below — that branch likely needs an early return; confirm.
            tasks, _ = get_tasks(model_params['targets'], reference_feature=model_params['reference_feature'])
            self.plot_reference_feature_plot(tasks, f_d, model_params['reference_feature'])
        else:
            # start all plotting procedures at once
            # NOTE(review): feature_list / reference_feature_list exist only when
            # GENERATOR_PARAMS is in model_params — confirm callers guarantee that.
            self.plot_feat_comparison(feature_list, reference_feature_list)
    def plot_reference_feature_plot(self, orig_targets, f_dict, reference_feature, resolution=10):
        """Compare the reference feature against every other feature in f_dict.

        Produces two figures: (1) scatter plots of generated vs. target feature
        values with connection lines and a cumulated-distance title, and (2)
        meshgrid heatmaps of the per-cell average distance. Both are saved as
        .png files under self.output_path.
        :param orig_targets: pd.DataFrame or defaultdict of DataFrames with the
            objective (target) feature values, keyed by competitor feature
        :param f_dict: dict mapping competitor feature name -> DataFrame of
            generated feature values (each with a 'log' column)
        :param reference_feature: feature plotted on the x-axis of every subplot
        :param resolution: number of meshgrid cells per axis
        """
        fig1, axes = plt.subplots(1, len(f_dict), figsize=(20, 4))
        if isinstance(axes, Axes):
            # A single comparison feature yields one bare Axes instead of an array.
            axes = [axes]
        fig2, axes_mesh = plt.subplots(1, len(f_dict), figsize=(20, 4), layout='compressed')
        if isinstance(axes_mesh, Axes):
            axes_mesh = [axes_mesh]
        for idx_ax, (k, v) in enumerate(f_dict.items()):
            if isinstance(orig_targets, pd.DataFrame):
                targets = orig_targets.copy()
            elif isinstance(orig_targets, defaultdict):
                if k not in orig_targets:
                    print(f"[WARNING] {k} not in targets. Only in generated features. Will continue with next feature to compare with")
                    continue
                targets = orig_targets[k].copy()
            else:
                # NOTE(review): this branch only prints; `targets` stays undefined and
                # the lines below would raise NameError — confirm an exit/return was intended.
                print(f"[ERR] Unknown file format for targets {type(orig_targets)}. Close program (Exit Code: 0).")
            # Identify NAN values of reference feature
            target_nan_values_idx_reference = np.where(targets[reference_feature].isna())[0]
            target_nan_logs_reference = targets.loc[target_nan_values_idx_reference]['log']
            # Identify NAN values of competitor feature
            target_nan_values_idx_competitor = np.where(targets[k].isna())[0]
            target_nan_logs_competitor = targets.loc[target_nan_values_idx_competitor]['log']
            # Collection of indices to drop
            target_nan_indices = np.unique(np.concatenate((target_nan_values_idx_competitor, target_nan_values_idx_reference)))
            # Drop NAN values in target DataFrame
            targets.drop(axis='index', index=target_nan_indices, inplace=True)
            # Check for indices in generated DataFrame
            reference_values_idx_reference = v[v['log'].isin(list(target_nan_logs_reference))].index
            reference_values_idx_competitor = v[v['log'].isin(list(target_nan_logs_competitor))].index
            # Collection of indices to drop for reference
            reference_nan_indices = np.unique(np.concatenate((reference_values_idx_reference, reference_values_idx_competitor)))
            # Drop NAN values in generated DataFrame
            v.drop(axis='index', index=reference_nan_indices, inplace=True)
            # Plot generated DataFrame + target DataFrame
            v.plot.scatter(x=v.columns.get_loc(reference_feature), y=v.columns.get_loc(k), ax=axes[idx_ax], c="red", alpha=0.3)
            targets.plot.scatter(x=targets.columns.get_loc(reference_feature), y=targets.columns.get_loc(k), ax=axes[idx_ax], c='blue', alpha=0.3)
            # Meshgrid accumulators: summed distance (Z) and hit count (cnt_Z) per cell.
            Z = np.zeros([resolution+1, resolution+1])
            cnt_Z = np.zeros([resolution+1, resolution+1])
            Z.fill(np.nan)
            min_Z_X = np.min(targets[reference_feature])
            min_Z_Y = np.min(targets[k])
            max_Z_X = np.max(targets[reference_feature])
            max_Z_Y = np.max(targets[k])
            step_Z_X = np.round((max_Z_X - min_Z_X) / float(resolution), 4)
            step_Z_Y = np.round((max_Z_Y - min_Z_Y) / float(resolution), 4)
            cum_sum = 0
            for idx in v.index:
                if isinstance(v, pd.DataFrame) and 'log' in v.columns:
                    c_log = v.loc[idx, 'log']
                    if c_log in targets['log'].values:
                        gen_entry = targets[targets['log'] == c_log]
                    else:
                        print(f"INFO: no value for {c_log} in generated files.")
                        gen_entry = targets
                else:
                    gen_entry = targets
                # Plot connection line
                axes[idx_ax].plot([v[reference_feature][idx], gen_entry[reference_feature].values[0]],
                                  [v[k][idx], gen_entry[k].values[0]],
                                  c="green", alpha=0.25)
                # Plot textual annotation
                axes[idx_ax].annotate(gen_entry['log'].values[0],
                                      (gen_entry[reference_feature].values[0], gen_entry[k].values[0]),
                                      fontsize=5)
                # Compute distance between real and generated dot
                vec1 = np.array([v[reference_feature][idx], v[k][idx]])
                vec2 = np.array([gen_entry[reference_feature].values[0], gen_entry[k].values[0]])
                # Bucket the target point into the meshgrid cell it falls in.
                Z_idx = int (np.round((gen_entry[reference_feature].values[0] - min_Z_X) / step_Z_X))
                Z_idy = int (np.round((gen_entry[k].values[0] - min_Z_Y) / step_Z_Y))
                if np.isnan(Z[Z_idx][Z_idy]):
                    Z[Z_idx][Z_idy] = 0.0
                Z[Z_idx][Z_idy] += np.linalg.norm(vec1 - vec2)
                cnt_Z[Z_idx][Z_idy] += 1
                cum_sum += np.linalg.norm(vec1 - vec2)
            print(f"INFO: Cumulated distances objectives <-> generated features for '{reference_feature}' vs. '{k}': {cum_sum:.4f}")
            X, Y = np.meshgrid(np.linspace(min_Z_X, max_Z_X, resolution+1),
                               np.linspace(min_Z_Y, max_Z_Y, resolution+1))
            cmap = plt.colormaps['viridis_r']
            # Cells never hit are filled with sqrt(2) before averaging the rest.
            Z[np.isnan(Z)] = np.sqrt(2)
            cnt_Z[cnt_Z==0] = 1
            Z /= cnt_Z
            colormesh = axes_mesh[idx_ax].pcolormesh(X, Y, Z.T, shading='nearest', cmap=cmap)
            axes_mesh[idx_ax].set_xlabel(reference_feature)
            axes_mesh[idx_ax].set_ylabel(k)
            if idx_ax == (len(f_dict)-1):
                # Attach the shared colorbar once, next to the last subplot.
                cbar = fig2.colorbar(colormesh, ax=axes_mesh, orientation='vertical', pad=0.01)
                cbar.ax.set_ylabel('Feature dist. of generated EDs and objectives', fontsize=8, rotation=90, labelpad=-50)
            axes[idx_ax].set_title(f"Cumulated distances {cum_sum:.4f}")
        tasks_keys = f_dict.keys()
        tasks_keys = list(sorted(tasks_keys))
        abbreviations = get_keys_abbreviation(tasks_keys)
        ref_short_name = get_keys_abbreviation([reference_feature])
        fig1_title = f'Feature Comparison - {reference_feature}'
        fig1.suptitle(fig1_title, fontsize=6)
        fig1.tight_layout()
        distance_plot_path = os.path.join(self.output_path,
                                          f"plot_genEL{len(self.gen)}_tasks{len(tasks_keys)}_{ref_short_name}_vs_{abbreviations}.png")
        fig1.savefig(distance_plot_path)
        print(f"Saved objectives vs. genEL features plot in {distance_plot_path}")
        fig2.suptitle(f'Meshgrid Comparison - {reference_feature}', fontsize=6)
        meshgrid_plot_path = os.path.join(self.output_path,
                                          f"plot_meshgrid_genEL{len(self.gen)}_tasks{len(tasks_keys)}_{ref_short_name}_vs_{abbreviations}.png")
        fig2.savefig(meshgrid_plot_path)
        print(f"Saved meshgrid plot in {meshgrid_plot_path}")
def plot_single_comparison(self, tasks, objective1, objective2, ax, ax_cmesh, fig2, axes_meshes, flag_plt_clbar):
    """
    Plot one objective pair: a scatter of task objectives vs. generated
    features (with green connector lines) on `ax`, and a meshgrid of the
    averaged task<->generated distances on `ax_cmesh`.

    :param tasks: DataFrame of target objectives. NOTE: mutated in place —
        a 'task' id column is added when no object-dtype column exists.
    :param objective1: column name plotted on the x-axis
    :param objective2: column name plotted on the y-axis
    :param ax: axes for the scatter/distance plot
    :param ax_cmesh: axes for the pcolormesh distance grid
    :param fig2: figure owning the mesh axes (used to attach the colorbar)
    :param axes_meshes: all mesh axes the shared colorbar spans
    :param flag_plt_clbar: when True, draw the shared colorbar on fig2
    :return: the QuadMesh produced by pcolormesh
    """
    # Ensure there is at least one string column to annotate points with.
    if len(tasks.select_dtypes(include=['object']).columns)==0:
        tasks['task']=[f"task_{str(x+1)}" for x in tasks.index.values.tolist()]
    # First NaN-free object-dtype column serves as the point label.
    id_col = tasks.select_dtypes(include=['object']).dropna(axis=1).columns[0]
    tasks.plot.scatter(x=objective1, y=objective2, ax=ax, alpha=0.3)
    self.gen.plot.scatter(x=objective1, y=objective2, c="red", ax=ax, alpha=0.3)
    # Z accumulates distances per unique (objective1, objective2) cell;
    # cnt_Z counts contributions so Z can be averaged at the end.
    Z = np.zeros([tasks[objective1].unique().size, tasks[objective2].unique().size])
    cnt_Z = np.zeros([tasks[objective1].unique().size, tasks[objective2].unique().size])
    Z.fill(np.inf)  # inf marks cells that received no data yet
    cum_sum = 0
    for idx in tasks.index:
        # Match the task row to its generated counterpart via 'log';
        # fall back to the whole generated frame when no match exists.
        if isinstance(tasks, pd.DataFrame) and 'log' in tasks.columns:
            c_log = tasks.loc[idx, 'log']
            if c_log in self.gen['log'].values:
                gen_entry = self.gen[self.gen['log'] == c_log]
            else:
                print(f"INFO: no value for {c_log} in generated files.")
                gen_entry = self.gen
        else:
            gen_entry = self.gen
        # Connector line between the task objective and the generated point.
        ax.plot([tasks[objective1][idx], gen_entry[objective1].values[0]],
                [tasks[objective2][idx], gen_entry[objective2].values[0]],
                c="green", alpha=0.25)
        ax.annotate(tasks[id_col][idx], (tasks[objective1][idx], tasks[objective2][idx]), fontsize=5)
        vec1 = np.array([tasks[objective1][idx], tasks[objective2][idx]])
        vec2 = np.array([gen_entry[objective1].values[0], gen_entry[objective2].values[0]])
        # Locate the grid cell of this task's (objective1, objective2) pair.
        Z_idx = np.where(tasks[objective1].unique() == tasks[objective1][idx])[0][0]
        Z_idy = np.where(tasks[objective2].unique() == tasks[objective2][idx])[0][0]
        if np.isinf(Z[Z_idx][Z_idy]):
            Z[Z_idx][Z_idy] = 0.0
        Z[Z_idx][Z_idy] += np.linalg.norm(vec1 - vec2)
        cnt_Z[Z_idx][Z_idy] += 1
        cum_sum += np.linalg.norm(vec1 - vec2)
    print(f"INFO: Cumulated distances objectives <-> generated features for '{objective1}' vs. '{objective2}':", cum_sum)
    ax.set_title(f"Cumulated distances {cum_sum:.4f}")
    X, Y = np.meshgrid(tasks[objective1].unique(), tasks[objective2].unique())
    cmap = plt.colormaps['viridis_r']
    # Empty cells get sqrt(2) — presumably the maximum distance for
    # features normalized to [0, 1]; confirm against the feature scaling.
    Z[np.isinf(Z)] = np.sqrt(2)
    cnt_Z[cnt_Z==0] = 1  # avoid division by zero for empty cells
    Z /= cnt_Z  # average distance per cell
    colormesh = ax_cmesh.pcolormesh(X, Y, Z.T, shading='nearest', cmap=cmap)  # vmin=0.0, vmax=1.0, cmap=cmap)
    ax_cmesh.set_xlabel(objective1)
    ax_cmesh.set_ylabel(objective2)
    if flag_plt_clbar:
        fig2.colorbar(colormesh, ax=axes_meshes, orientation='vertical')
    return colormesh
def plot_settings(self):
    """Shrink title, label, and base font sizes for the comparison plots."""
    # Single rcParams update instead of three mpl.rc(...) calls.
    mpl.rcParams.update({
        'axes.titlesize': 8,   # fontsize of the axes title
        'axes.labelsize': 8,   # fontsize of the x and y labels
        'font.size': 8,
    })
def plot_feat_comparison(self, feature_list, reference_list):
    """
    Draw a grid of pairwise comparisons (reference feature x feature):
    fig1 holds the scatter/distance plots, fig2 the distance meshgrids.
    Both figures are saved as PNGs into self.output_path.

    :param feature_list: feature column names (grid columns)
    :param reference_list: reference feature column names (grid rows)
    """
    len_features = len(feature_list)
    len_ref_feats = len(reference_list)
    fig1, axes = plt.subplots(len_ref_feats, len_features)
    fig2, axes_meshes = plt.subplots(len_ref_feats, len_features, layout='compressed')
    for idx1, entry1 in enumerate(reference_list):
        for idx2, entry2 in enumerate(feature_list):
            # plt.subplots returns a bare Axes (1x1 grid), a 1-D array
            # (single row), or a 2-D array; pick the cell accordingly.
            # NOTE(review): the 1-D branch assumes one reference feature;
            # len_features == 1 with several references would mis-index.
            if isinstance(axes, Axes):
                ax = axes
                ax_cmesh = axes_meshes
            elif len_ref_feats == 1:
                ax = axes[idx2]
                ax_cmesh = axes_meshes[idx2]
            else:
                ax = axes[idx1][idx2]
                ax_cmesh = axes_meshes[idx1][idx2]
            # Only the last grid cell triggers the shared colorbar.
            flag_plt_clbar = (idx2 == len_features - 1) and (idx1 == len_ref_feats - 1)
            self.plot_single_comparison(self.tasks, entry1, entry2, ax, ax_cmesh, fig2, axes_meshes, flag_plt_clbar)
    objectives_keys = self.tasks.select_dtypes(exclude=['object']).columns
    objectives_keys = list(sorted(objectives_keys))
    abbreviations = get_keys_abbreviation(objectives_keys)
    fig1_title = f'Feature Comparison with {self.model_params[GENERATOR_PARAMS]}'
    fig1.suptitle(insert_newlines(fig1_title), fontsize=6)
    fig1.tight_layout()
    # FIX: use the GENERATOR_PARAMS key consistently (was a hard-coded
    # 'generator_params' literal) and build the shared filename suffix once.
    n_trials = self.model_params[GENERATOR_PARAMS]['n_trials']
    file_suffix = f"genEL{len(self.gen)}_objectives{len(objectives_keys)}_trials{n_trials}_{abbreviations}.png"
    # FIX: ensure the output directory exists before *either* savefig call.
    os.makedirs(self.output_path, exist_ok=True)
    distance_plot_path = os.path.join(self.output_path, f"eval_{file_suffix}")
    fig1.savefig(distance_plot_path)
    print(f"Saved objectives vs. genEL features plot in {distance_plot_path}")
    # fig2.suptitle('Meshgrid Comparison', fontsize=12)
    meshgrid_plot_path = os.path.join(self.output_path, f"meshgrid_{file_suffix}")
    fig2.savefig(meshgrid_plot_path)
    print(f"Saved meshgrid plot in {meshgrid_plot_path}")
def plot_dist_mx (self, model_params):
    """
    Compute a distance matrix between generated event-log features and
    their target objectives, render it as a heatmap, and plot a histogram
    of the individual distances. Generated feature CSVs are read from the
    prefixes in self.input_path; target CSVs from model_params["targets"].

    :param model_params: dict; "targets" maps to path prefixes of the
        target CSV files (each globbed with '*.csv').
    """
    gen_dict = defaultdict(lambda: defaultdict(dict))
    targets_dict = defaultdict(lambda: defaultdict(dict))
    set_ = set()
    # Collect generated-feature CSVs, keyed by their (feat1, feat2) columns.
    for in_file in self.input_path:
        for file in glob.glob(f'{in_file}*.csv'):
            read_in = pd.read_csv(file)
            if len(read_in.columns) == 2:
                # Single-objective file: same feature for both keys.
                feat1 = read_in.columns[0]
                feat2 = feat1
            else:
                feat1 = read_in.columns[0]
                feat2 = read_in.columns[1]
            read_in['fn'] = file  # remember the source file per row
            gen_dict[feat1][feat2] = read_in
            set_.add(feat1)
            set_.add(feat2)
    # Collect target CSVs the same way; their first column is the log id.
    for target_file in model_params["targets"]:
        for file in glob.glob(f'{target_file}*.csv'):
            read_in = pd.read_csv(file)
            if 'task' in read_in.columns:
                read_in.rename(columns={"task":"log"}, inplace=True)
            if len(read_in.columns) == 2:
                feat1 = read_in.columns[1]
                feat2 = feat1
            else:
                feat1 = read_in.columns[1]
                feat2 = read_in.columns[2]
            read_in['fn'] = file
            targets_dict[feat1][feat2] = read_in
            set_.add(feat1)
            set_.add(feat2)
    keys = sorted(list(set_))
    result_df = pd.DataFrame(index=keys, columns=keys)
    dist_list = list()
    for gen_obj1_key, gen_obj1_vals in gen_dict.items():
        if gen_obj1_key not in targets_dict:
            continue
        for gen_obj1_value in gen_obj1_vals:
            if gen_obj1_value not in targets_dict[gen_obj1_key]:
                continue
            gen_df = gen_dict[gen_obj1_key][gen_obj1_value]
            target_df = targets_dict[gen_obj1_key][gen_obj1_value]
            cnt = 0
            cum_sum = 0
            for i in gen_df.index:
                current_log_name = gen_df.loc[i, 'log']
                # FIX: skip logs without a target entry; previously a stale
                # (or unbound) target_entry from an earlier iteration was
                # reused for the distance computation below.
                if current_log_name not in target_df['log'].values:
                    print (f"[INFO] no value found for {current_log_name} in target file")
                    continue
                target_entry = target_df[target_df['log'] == current_log_name]
                vec1 = np.array([gen_df[gen_obj1_key][i], gen_df[gen_obj1_value][i]])
                vec2 = np.array([target_entry[gen_obj1_key].values[0], target_entry[gen_obj1_value].values[0]])
                dist = np.linalg.norm(vec1 - vec2)  # hoisted: was computed 3x
                cum_sum += dist
                cnt += 1
                THRESHOLD=0.1
                if dist < THRESHOLD and len(gen_df.columns)>3:#3 for 1 objective
                    # Close matches trigger instance selection of the log.
                    # NOTE(review): `in_file` leaks from the loop above and
                    # holds its *last* value here — confirm a single input
                    # path is intended.
                    path_splits = gen_df.loc[i, 'fn'].split("/")
                    data_splits = path_splits[-1][:-4].split("_")
                    log_path= f'grid_2objectives_{data_splits[1]}_{data_splits[2]}/2_{data_splits[1]}_{data_splits[2]}/genEL{current_log_name}_*.xes'
                    dest, len_is = select_instance(in_file.replace("features/", ""), log_path)
                dist_list.append(dist)
            # FIX: guard against division by zero when no row had a match.
            if cnt:
                cum_sum /= cnt
            result_df.loc[gen_obj1_key, gen_obj1_value] = cum_sum
            result_df.loc[gen_obj1_value, gen_obj1_key] = cum_sum
    try:
        print(f"INFO: Instance selection saved {len_is} ED selection in {dest}")
    except UnboundLocalError as e:
        # No instance was ever selected, so dest/len_is were never bound.
        print(e)
    # Hard-coded diagonal baselines; presumably averages over 11 runs — confirm.
    ratio_most_common_variant = 2.021278 / 11.0
    ratio_top_10_variants = 0.07378 / 11.0
    ratio_variants_per_number_of_traces = 0.016658 / 11.0
    # FIX: use .loc instead of chained indexing (result_df[c][c] = v), which
    # assigns through an intermediate Series and may not propagate; also skip
    # features absent from this run instead of raising KeyError.
    for feat, ratio in (('ratio_most_common_variant', ratio_most_common_variant),
                        ('ratio_top_10_variants', ratio_top_10_variants),
                        ('ratio_variants_per_number_of_traces', ratio_variants_per_number_of_traces)):
        if feat in result_df.columns:
            result_df.loc[feat, feat] = ratio
    abbrvs_key = get_keys_abbreviation(keys)
    result_df.columns = abbrvs_key.split("_")
    result_df.index = abbrvs_key.split("_")
    sns.heatmap(result_df.astype(np.float16),annot=True, cmap="viridis_r", vmin=0.0, vmax=1.0)
    # plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(os.path.join(self.output_path, f"dist_mx_{abbrvs_key}"))
    plt.show()
    fig = plt.figure()
    sns.histplot(data=pd.DataFrame(dist_list), x=0, bins=30)
    fig.savefig(os.path.join(self.output_path, "dist_histogram"))