Andrea Maldonado committed on
Commit 3735e7d · 1 Parent(s): 3c2100c

Remove unused

config_files/algorithm/fix_24.json DELETED
@@ -1,34 +0,0 @@
-[
-    {
-        "pipeline_step": "event_logs_generation",
-        "output_path": "data/generated",
-        "generator_params": {
-            "objectives": {
-                "normalized_sequence_entropy_linear_forgetting": 0.05,
-                "ratio_top_20_variants": 0.4
-            },
-            "config_space": {
-                "mode": [5, 40],
-                "sequence": [0.01, 1],
-                "choice": [0.01, 1],
-                "parallel": [0.01, 1],
-                "loop": [0.01, 1],
-                "silent": [0.01, 1],
-                "lt_dependency": [0.01, 1],
-                "num_traces": [100, 1001],
-                "duplicate": [0],
-                "or": [0]
-            },
-            "n_trials": 20
-        }
-    },
-    {
-        "pipeline_step": "feature_extraction",
-        "input_path": "data/generated",
-        "feature_params": {"feature_set": ["simple_stats", "trace_length", "trace_variant", "activities", "start_activities", "end_activities", "entropies", "complexity"]},
-        "feature_params": {"feature_set": ["trace_length"]},
-        "output_path": "output/plots",
-        "real_eventlog_path": "data/log_meta_features.csv",
-        "plot_type": "boxplot"
-    }
-]
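Note on the removed config: its second pipeline step declares "feature_params" twice, and a plain json.load keeps only the last occurrence, so the longer feature set was effectively ignored if this file was parsed with the standard library. A minimal sketch (hypothetical helper, not part of this repo) that makes such duplicates fail loudly:

    import json

    def reject_duplicate_keys(pairs):
        # json.load silently keeps the last value for a repeated key;
        # this object_pairs_hook raises instead.
        obj = {}
        for key, value in pairs:
            if key in obj:
                raise ValueError(f"duplicate key: {key!r}")
            obj[key] = value
        return obj

    with open("config_files/algorithm/fix_24.json") as f:
        steps = json.load(f, object_pairs_hook=reject_duplicate_keys)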
dashboard.py DELETED
@@ -1,295 +0,0 @@
-from copy import deepcopy
-from meta_feature_extraction.simple_stats import simple_stats
-from meta_feature_extraction.trace_length import trace_length
-from meta_feature_extraction.trace_variant import trace_variant
-from meta_feature_extraction.activities import activities
-from meta_feature_extraction.start_activities import start_activities
-from meta_feature_extraction.end_activities import end_activities
-from meta_feature_extraction.entropies import entropies
-from pm4py import discover_petri_net_inductive as inductive_miner
-from pm4py import generate_process_tree
-from pm4py import save_vis_petri_net, save_vis_process_tree
-from pm4py.algo.filtering.log.variants import variants_filter
-from pm4py.algo.simulation.tree_generator import algorithm as tree_generator
-from pm4py.algo.simulation.playout.process_tree import algorithm as playout
-from pm4py.objects.conversion.log import converter as log_converter
-from pm4py.objects.log.exporter.xes import exporter as xes_exporter
-from pm4py.objects.log.importer.xes import importer as xes_importer
-from pm4py.objects.log.util import dataframe_utils
-from pm4py.sim import play_out
-
-import matplotlib.image as mpimg
-import os
-import pandas as pd
-import streamlit as st
-
-OUTPUT_PATH = "output"
-SAMPLE_EVENTS = 500
-
-@st.cache(allow_output_mutation=True)
-def load_from_xes(uploaded_file):
-    bytes_data = uploaded_file.getvalue()
-    log1 = xes_importer.deserialize(bytes_data)
-    get_stats(log1)
-    return log1
-
-@st.cache
-def load_from_csv(uploaded_file, sep):
-    if uploaded_file is not None:
-        df = pd.read_csv(uploaded_file, sep=sep, index_col=False)
-        return df
-
-def get_stats(log, save=True):
-    """Returns the statistics of an event log."""
-    num_traces = len(log)
-    num_events = sum([len(c) for c in log])
-    num_utraces = len(variants_filter.get_variants(log))
-    if save:
-        st.session_state["num_traces"] = num_traces
-        st.session_state["num_events"] = num_events
-        st.session_state["num_utraces"] = num_utraces
-    return num_utraces, num_traces, num_events
-
-#@st.cache
-def df_to_log(df, case_id, activity, timestamp):
-    df.rename(columns={case_id: 'case:concept:name',
-                       activity: 'concept:name',
-                       timestamp: "time:timestamp"}, inplace=True)
-    temp = dataframe_utils.convert_timestamp_columns_in_df(df)
-    #temp = temp.sort_values(timestamp)
-    log = log_converter.apply(temp)
-    return log, 'concept:name', "time:timestamp"
-
-def read_uploaded_file(uploaded_file):
-    extension = uploaded_file.name.split('.')[-1]
-    log_name = uploaded_file.name.split('.')[-2]
-
-    st.sidebar.write("Loaded ", extension.upper(), '-File: ', uploaded_file.name)
-    if extension == "xes":
-        event_log = load_from_xes(uploaded_file)
-        log_columns = [*list(event_log[0][0].keys())]
-        convert_button = False
-        case_id = "case:concept:name"
-        activity = "concept:name"
-        timestamp = "time:timestamp"
-        default_act_id = log_columns.index("concept:name")
-        default_tst_id = log_columns.index("time:timestamp")
-
-        event_df = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)
-        df_path = OUTPUT_PATH+"/"+log_name+".csv"
-        event_df.to_csv(df_path, sep=";", index=False)
-        return event_log, event_df, case_id, activity
-
-    elif extension == "csv":
-        sep = st.sidebar.text_input("Columns separator", ";")
-        event_df = load_from_csv(uploaded_file, sep)
-        old_df = deepcopy(event_df)
-        log_columns = event_df.columns
-
-        case_id = st.sidebar.selectbox("Choose 'case' column:", log_columns)
-        activity = st.sidebar.selectbox("Choose 'activity' column:", log_columns, index=0)
-        timestamp = st.sidebar.selectbox("Choose 'timestamp' column:", log_columns, index=0)
-
-        convert_button = st.sidebar.button('Confirm selection')
-        if convert_button:
-            temp = deepcopy(event_df)
-            event_log, activity, timestamp = df_to_log(temp, case_id, activity, timestamp)
-            #xes_exporter.apply(event_log, INPUT_XES)
-            log_columns = [*list(event_log[0][0].keys())]
-            st.session_state['log'] = event_log
-            return event_log, event_df, case_id, activity
-
-def sample_log_traces(complete_log, sample_size):
-    '''
-    Samples random traces out of a log,
-    so that the number of events is slightly over sample_size.
-    :param complete_log: Log extracted from xes
-    '''
-
-    log_traces = variants_filter.get_variants(complete_log)
-    keys = list(log_traces.keys())
-    sample_traces = {}
-    num_evs = 0
-    while num_evs < sample_size:
-        if len(keys) == 0:
-            break
-        random_trace = keys.pop()
-        sample_traces[random_trace] = log_traces[random_trace]
-        evs = sum([len(case_id) for case_id in sample_traces[random_trace]])
-        num_evs += evs
-    log1 = variants_filter.apply(complete_log, sample_traces)
-    return log1
-
-def show_process_petrinet(event_log, filter_info, OUTPUT_PATH):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}_{filter_info}".replace(":", "").replace(".", "") + ".png"  # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    try:
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-        st.write("Loaded from memory")
-    except FileNotFoundError:
-        net, im, fm = inductive_miner(event_log)
-        # parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: 0.99,
-        #             pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"})
-        #parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
-        save_vis_petri_net(net, im, fm, OUTPUT_PLOT)
-        st.write("Saved in: ", OUTPUT_PLOT)
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-    st.image(fig_pt)
-
-def show_loaded_event_log(event_log, event_df):
-    get_stats(event_log)
-    st.write("### Loaded event-log")
-    col1, col2 = st.columns(2)
-    with col2:
-        st.dataframe(event_df)
-    with col1:
-        show_process_petrinet(event_log, None, OUTPUT_PATH+"running-example")
-
-def extract_meta_features(log, log_name):
-    mtf_cols = ["log", "n_traces", "n_unique_traces", "ratio_unique_traces_per_trace", "n_events", "trace_len_min", "trace_len_max",
-                "trace_len_mean", "trace_len_median", "trace_len_mode", "trace_len_std", "trace_len_variance", "trace_len_q1",
-                "trace_len_q3", "trace_len_iqr", "trace_len_geometric_mean", "trace_len_geometric_std", "trace_len_harmonic_mean",
-                "trace_len_skewness", "trace_len_kurtosis", "trace_len_coefficient_variation", "trace_len_entropy", "trace_len_hist1",
-                "trace_len_hist2", "trace_len_hist3", "trace_len_hist4", "trace_len_hist5", "trace_len_hist6", "trace_len_hist7",
-                "trace_len_hist8", "trace_len_hist9", "trace_len_hist10", "trace_len_skewness_hist", "trace_len_kurtosis_hist",
-                "ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants", "ratio_top_10_variants", "ratio_top_20_variants",
-                "ratio_top_50_variants", "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence", "skewness_variant_occurrence",
-                "kurtosis_variant_occurrence", "n_unique_activities", "activities_min", "activities_max", "activities_mean", "activities_median",
-                "activities_std", "activities_variance", "activities_q1", "activities_q3", "activities_iqr", "activities_skewness",
-                "activities_kurtosis", "n_unique_start_activities", "start_activities_min", "start_activities_max", "start_activities_mean",
-                "start_activities_median", "start_activities_std", "start_activities_variance", "start_activities_q1", "start_activities_q3",
-                "start_activities_iqr", "start_activities_skewness", "start_activities_kurtosis", "n_unique_end_activities", "end_activities_min",
-                "end_activities_max", "end_activities_mean", "end_activities_median", "end_activities_std", "end_activities_variance",
-                "end_activities_q1", "end_activities_q3", "end_activities_iqr", "end_activities_skewness", "end_activities_kurtosis", "entropy_trace",
-                "entropy_prefix", "entropy_global_block", "entropy_lempel_ziv", "entropy_k_block_diff_1", "entropy_k_block_diff_3",
-                "entropy_k_block_diff_5", "entropy_k_block_ratio_1", "entropy_k_block_ratio_3", "entropy_k_block_ratio_5", "entropy_knn_3",
-                "entropy_knn_5", "entropy_knn_7"]
-    features = [log_name]
-    features.extend(simple_stats(log))
-    features.extend(trace_length(log))
-    features.extend(trace_variant(log))
-    features.extend(activities(log))
-    features.extend(start_activities(log))
-    features.extend(end_activities(log))
-    features.extend(entropies(log_name, OUTPUT_PATH))
-
-    mtf = pd.DataFrame([features], columns=mtf_cols)
-
-    st.dataframe(mtf)
-    return mtf
-
-def generate_pt(mtf):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}/generated_pt".replace(":", "").replace(".", "")  #+".png"  # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    st.write("### PT Gen configurations")
-    col1, col2, col3, col4, col5, col6 = st.columns(6)
-    with col1:
-        param_mode = st.text_input('Mode', str(round(mtf['activities_median'].iat[0])))  #?
-        st.write("Sum of probabilities must be one")
-    with col2:
-        param_min = st.text_input('Min', str(mtf['activities_min'].iat[0]))
-        param_seq = st.text_input('Probability Sequence', 0.25)
-    with col3:
-        param_max = st.text_input('Max', str(mtf['activities_max'].iat[0]))
-        param_cho = st.text_input('Probability Choice (XOR)', 0.25)
-    with col4:
-        param_nmo = st.text_input('Number of models', 1)
-        param_par = st.text_input('Probability Parallel', 0.25)
-    with col5:
-        param_dup = st.text_input('Duplicates', 0)
-        param_lop = st.text_input('Probability Loop', 0.25)
-    with col6:
-        param_sil = st.text_input('Silent', 0.2)
-        param_or = st.text_input('Probability Or', 0.0)
-
-    PT_PARAMS = {tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MODE: round(float(param_mode)),  # most frequent number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MIN: int(param_min),  # minimum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MAX: int(param_max),  # maximum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SEQUENCE: float(param_seq),  # probability to add a sequence operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.CHOICE: float(param_cho),  # probability to add a choice (XOR) operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.PARALLEL: float(param_par),  # probability to add a parallel operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.LOOP: float(param_lop),  # probability to add a loop operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.OR: float(param_or),  # probability to add an or operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SILENT: float(param_sil),  # probability to add a silent activity to a choice or loop operator
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.DUPLICATE: int(param_dup),  # probability to duplicate an activity label
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.NO_MODELS: int(param_nmo)}  # number of trees to generate from model population
-
-    process_tree = generate_process_tree(parameters=PT_PARAMS)
-    save_vis_process_tree(process_tree, OUTPUT_PLOT+"_tree.png")
-
-    st.write("### Playout configurations")
-
-    param_ntraces = st.text_input('Number of traces', str(mtf['n_traces'].iat[0]))
-    PO_PARAMS = {playout.Variants.BASIC_PLAYOUT.value.Parameters.NO_TRACES: int(param_ntraces)}
-
-    ptgen_log = play_out(process_tree, parameters=PO_PARAMS)
-
-    net, im, fm = inductive_miner(ptgen_log)
-    save_vis_petri_net(net, im, fm, OUTPUT_PLOT+".png")
-    st.write("Saved in: ", OUTPUT_PLOT)
-    fig_pt_net = mpimg.imread(OUTPUT_PLOT+".png")
-    fig_pt_tree = mpimg.imread(OUTPUT_PLOT+"_tree.png")
-
-    fcol1, fcol2 = st.columns(2)
-    with fcol1:
-        st.image(fig_pt_tree)
-    with fcol2:
-        st.image(fig_pt_net)
-    extract_meta_features(ptgen_log, "gen_pt")
-
-
-if __name__ == '__main__':
-    st.set_page_config(layout='wide')
-    """
-    # Event Log Generator
-    """
-    start_options = ['Event-Log', 'Meta-features']
-    start_preference = st.sidebar.selectbox("Do you want to start with a log or with metafeatures?", start_options, 0)
-    #lets_start = st.sidebar.button("Let's start with "+start_preference+'!')
-
-    if start_preference == start_options[0]:
-        st.sidebar.write("Upload a dataset in csv or xes-format:")
-        uploaded_file = st.sidebar.file_uploader("Pick a logfile")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state['log'] if "log" in st.session_state else None
-        if uploaded_file:
-            event_log, event_df, case_id, activity_id = read_uploaded_file(uploaded_file)
-            #event_log = deepcopy(event_log)
-
-            use_sample = st.sidebar.checkbox('Use random sample', True)
-            if use_sample:
-                sample_size = st.sidebar.text_input('Sample size of approx number of events', str(SAMPLE_EVENTS))
-                sample_size = int(sample_size)
-
-                event_log = sample_log_traces(event_log, sample_size)
-                sample_cases = [event_log[i].attributes['concept:name'] for i in range(0, len(event_log))]
-                event_df = event_df[event_df[case_id].isin(sample_cases)]
-
-            show_loaded_event_log(event_log, event_df)
-            ext_mtf = extract_meta_features(event_log, "running-example")
-            generate_pt(ext_mtf)
-
-    elif start_preference == start_options[1]:
-        LOG_COL = 'log'
-        st.sidebar.write("Upload a dataset in csv-format")
-        uploaded_file = st.sidebar.file_uploader("Pick a file containing meta-features")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state[LOG_COL] if "log" in st.session_state else None
-        if uploaded_file:
-            sep = st.sidebar.text_input("Columns separator", ";")
-            mtf = load_from_csv(uploaded_file, sep)
-            st.dataframe(mtf)
-
-            log_options = mtf['log'].unique()
-            log_preference = st.selectbox("What log should we use for generating a new event-log?", log_options, 1)
-            mtf_selection = mtf[mtf[LOG_COL] == log_preference]
-            generate_pt(mtf_selection)
-            st.write("##### Original")
-            st.write(mtf_selection)
-
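At its core, the removed dashboard's generate_pt() performs a generate → play out → rediscover round-trip; a headless sketch using only the pm4py calls the file itself imports (calling generate_process_tree() with no arguments is assumed to fall back to pm4py's default tree-population parameters, not the dashboard's widget values):

    from pm4py import discover_petri_net_inductive, generate_process_tree
    from pm4py.sim import play_out

    tree = generate_process_tree()                    # sample a random process tree
    log = play_out(tree)                              # simulate an event log from the tree
    net, im, fm = discover_petri_net_inductive(log)   # rediscover a Petri net from the simulated log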
gedi/__init__.py CHANGED
@@ -1,6 +1,5 @@
 from .generator import GenerateEventLogs
 from .features import EventLogFeatures
-from .analyser import FeatureAnalyser
 from .augmentation import InstanceAugmentator
 from .benchmark import BenchmarkTest
 from .plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter
gedi/analyser.py DELETED
@@ -1,123 +0,0 @@
-import numpy as np
-import warnings
-
-from sklearn.decomposition import FastICA, PCA
-from sklearn.manifold import TSNE
-from sklearn.preprocessing import Normalizer, StandardScaler
-from gedi.features import EventLogFeatures
-from gedi.plotter import ModelResultPlotter
-from gedi.utils.matrix_tools import insert_missing_data
-# TODO: Call param_keys explicitly e.g. import INPUT_PATH
-from utils.param_keys import *
-from utils.param_keys.analyser import MODEL, INPUT_PARAMS, PERPLEXITY
-
-
-# FUDO: Use this class to compare models during evaluation
-class FeatureAnalyser:
-    def __init__(self, features, params=None):
-        self.features: EventLogFeatures = features
-        self.params: dict = {
-            PLOT_TYPE: params.get(PLOT_TYPE, COLOR_MAP),
-            PLOT_TICS: params.get(PLOT_TICS, True),
-            INTERACTIVE: params.get(INTERACTIVE, True),
-            N_COMPONENTS: params.get(N_COMPONENTS, 2),
-            PERPLEXITY: params.get(PERPLEXITY, 3)
-        }
-    def compare(self, model_parameter_list: list[dict], plot_results: bool = True) -> list[dict]:
-        """
-        :param model_parameter_list: list[dict]
-            Different model input parameters, saved in a list
-        :param plot_results: bool
-            Plots the components of the different models (default: True)
-            The function can be calculated
-        :return: list[dict]
-            The results of the models {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model_results = []
-        for model_parameters in model_parameter_list:
-            try:
-                model_results.append(self.get_model_result(model_parameters))
-            except np.linalg.LinAlgError as e:
-                warnings.warn(f'Eigenvalue decomposition for model `{model_parameters}` could not be calculated:\n {e}')
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-
-        if plot_results:
-            self.compare_with_plot(model_results)
-
-        return model_results
-
-    def compare_with_plot(self, model_results_list):
-        """
-        This method is used to compare the results in a plot, after fit_transforming different models.
-        @param model_results_list: list[dict]
-            Different model input parameters, saved in a list.
-        """
-        ModelResultPlotter().plot_models(
-            model_results_list,
-            plot_type=self.params[PLOT_TYPE],
-            plot_tics=self.params[PLOT_TICS],
-            components=self.params[N_COMPONENTS]
-        )
-
-    def get_model_result(self, model_parameters: dict, log: bool = True) -> dict:
-        """
-        Returns a dict of all the important result values. Used for analysing the different models
-        :param model_parameters: dict
-            The input parameters for the model
-        :param log: bool
-            Enables the log output while running the program (default: True)
-        :return: dict of the results: {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model, projection = self.get_model_and_projection(model_parameters, log=log)
-        try:
-            ex_var = model.explained_variance_ratio_
-        except AttributeError as e:
-            warnings.warn(str(e))
-            ex_var = 0
-        return {MODEL: model, PROJECTION: projection, EXPLAINED_VAR: ex_var, INPUT_PARAMS: model_parameters}
-
-    def get_model_and_projection(self, model_parameters: dict, inp: np.ndarray = None, log: bool = True):
-        """
-        This method is fitting a model with the given parameters :model_parameters: and
-        the inp(ut) data is transformed on the model.
-        @param model_parameters: dict
-            The input parameters for the model.
-        @param inp: np.ndarray
-            Input data for the model (optional), (default: None -> calculated on the basis of the model_parameters)
-        @param log: bool
-            Enables the log output while running the program (default: True)
-        @return: fitted model and transformed data
-        """
-        if log:
-            print(f'Running {model_parameters}...')
-
-        if inp is None:
-            inp = insert_missing_data(self.features.feat)
-
-        if ALGORITHM_NAME not in model_parameters.keys():
-            raise KeyError(f'{ALGORITHM_NAME} is a mandatory model parameter.')
-
-        if model_parameters[ALGORITHM_NAME].startswith('normalized'):
-            inp = Normalizer(norm="l2").fit_transform(inp)
-        elif model_parameters[ALGORITHM_NAME].startswith('std_scaled'):
-            scaler = StandardScaler()
-            inp = scaler.fit_transform(inp)
-        try:
-            if 'pca' in model_parameters[ALGORITHM_NAME]:
-                # from sklearn.decomposition import PCA
-                pca = PCA(n_components=self.params[N_COMPONENTS])
-                # pca = coor.pca(data=inp, dim=self.params[N_COMPONENTS])
-                return pca, pca.fit_transform(inp)
-            elif 'tsne' in model_parameters[ALGORITHM_NAME]:
-                tsne = TSNE(n_components=self.params[N_COMPONENTS], learning_rate='auto',
-                            init='random', perplexity=self.params[PERPLEXITY])
-                return tsne, tsne.fit_transform(inp)
-            #elif model_parameters[ALGORITHM_NAME] == 'original_ica':
-            #    ica = FastICA(n_components=self.params[N_COMPONENTS])
-            #    return ica, ica.fit_transform(inp)
-            else:
-                warnings.warn(f'No original algorithm was found with name: {model_parameters[ALGORITHM_NAME]}')
-        except TypeError:
-            raise TypeError(f'Input data of the function is not correct. '
-                            f'Original algorithms take only 2-n-dimensional ndarray')
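Stripped of the param-key indirection, get_model_and_projection() above is standard scikit-learn; a self-contained sketch of its two live branches, where X stands in for the extracted feature matrix (random data here, purely illustrative):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import StandardScaler

    X = np.random.rand(100, 10)              # placeholder feature matrix
    X = StandardScaler().fit_transform(X)    # the 'std_scaled' prefix branch

    pca = PCA(n_components=2)
    pca_projection = pca.fit_transform(X)
    print(pca.explained_variance_ratio_)     # what get_model_result() reports as EXPLAINED_VAR

    tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
    tsne_projection = tsne.fit_transform(X)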
gedi/utils/algorithms/__init__.py DELETED
@@ -1,67 +0,0 @@
-import numpy as np
-from sklearn.base import TransformerMixin, BaseEstimator
-
-from utils.param_keys import N_COMPONENTS
-
-
-class MyModel(TransformerMixin, BaseEstimator):
-    """
-    This class and some child classes are partly copied from:
-    https://towardsdatascience.com/implementing-pca-from-scratch-fb434f1acbaa
-    and commented with the help of:
-    https://www.askpython.com/python/examples/principal-component-analysis
-    """
-    def __init__(self):
-        self.explained_variance_ = None
-        self.components_ = None
-        self._standardized_data = None
-        self.n_components = None
-        self.n_samples = None
-        self._covariance_matrix = None
-
-    def __str__(self):
-        return f'{self.__class__.__name__}:\ncomponents={self.n_components}'
-
-    def fit_transform(self, data_ndarray, **fit_params):
-        self.fit(data_ndarray, **fit_params)
-        return self.transform(data_ndarray)
-
-    def fit(self, data_matrix, **fit_params):
-        self.n_samples = data_matrix.shape[0]
-        self.n_components = fit_params.get(N_COMPONENTS, 2)
-        self._standardized_data = self._standardize_data(data_matrix)
-        self._covariance_matrix = self.get_covariance_matrix()
-        self.components_ = self.get_eigenvectors()
-        return self
-
-    @staticmethod
-    def _standardize_data(matrix):
-        """
-        Subtract mean and divide by standard deviation column-wise.
-        Doing this proves to be very helpful when calculating the covariance matrix.
-        https://towardsdatascience.com/understanding-the-covariance-matrix-92076554ea44
-        Mean-center the data
-        :param matrix: Data as matrix
-        :return: Standardized data matrix
-        """
-        numerator = matrix - np.mean(matrix, axis=0)
-        denominator = np.std(matrix, axis=0)
-        return numerator / denominator
-
-    def get_covariance_matrix(self):
-        """
-        Calculate covariance matrix with standardized matrix A
-        :return: Covariance Matrix
-        """
-        return np.cov(self._standardized_data.T)
-
-    def get_eigenvectors(self):
-        pass
-
-    def transform(self, data_matrix):
-        """
-        Project the data to the lower dimension with the help of the eigenvectors.
-        :return: Data reduced to lower dimensions from higher dimensions
-        """
-        data_matrix_standardized = self._standardize_data(data_matrix)
-        return np.dot(data_matrix_standardized, self.components_[:, :self.n_components])
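The base class stages classic PCA (standardize → covariance → eigendecompose → project) and leaves get_eigenvectors() to subclasses. The complete recipe as one self-contained function (a sketch of the same technique, not code from this repo):

    import numpy as np

    def pca_via_covariance(X, n_components=2):
        # Standardize column-wise, as MyModel._standardize_data does.
        X_std = (X - X.mean(axis=0)) / X.std(axis=0)
        cov = np.cov(X_std.T)
        # eigh is the right solver here: covariance matrices are symmetric.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        order = np.argsort(eigenvalues)[::-1]    # largest variance first
        components = eigenvectors[:, order]
        return X_std @ components[:, :n_components]

    projected = pca_via_covariance(np.random.rand(50, 8))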
gedi/utils/algorithms/tsne.py DELETED
@@ -1,69 +0,0 @@
-from scipy import spatial
-
-from tag.utils.algorithms import MyModel
-import sklearn.manifold as sk
-import numpy as np
-
-from tag.utils.matrix_tools import ensure_matrix_symmetry
-
-"""
-Parts of this file were originally copied from the tltsne python module.
-https://github.com/spiwokv/tltsne/blob/master/tltsne/__init__.py
-Since the results in the text file are complicated to reuse, this module was modified somewhat.
-This way, the results of the models can be used and it's Object Oriented.
-"""
-
-
-class MyTSNE(MyModel):
-    def __init__(self, n_components, perplexity=7.0,
-                 early_exaggeration=12.0, learning_rate="auto",
-                 n_iter=1000, metric="euclidean"):
-        super().__init__()
-        self.model = sk.TSNE(
-            n_components=n_components, perplexity=perplexity,
-            early_exaggeration=early_exaggeration, learning_rate=learning_rate,
-            n_iter=n_iter, metric=metric
-        )
-
-    def fit_transform(self, data_matrix, **fit_params):
-        return self.model.fit_transform(data_matrix, **fit_params)
-
-
-class MyTimeLaggedTSNE(MyTSNE):
-    def __init__(self, lag_time, **kwargs):
-        super().__init__(metric="precomputed", **kwargs)
-        self.lag_time = lag_time
-
-    def fit_transform(self, data_matrix, **fit_params):
-        data_zero_mean = data_matrix - np.mean(data_matrix, axis=0)
-        cov = np.cov(data_zero_mean.T)
-        eigenvalue, eigenvector = np.linalg.eig(cov)
-        eigenvalue_order = np.argsort(eigenvalue)[::-1]
-        eigenvector = eigenvector[:, eigenvalue_order]
-        eigenvalue = eigenvalue[eigenvalue_order]
-        projection = data_zero_mean.dot(eigenvector) / np.sqrt(eigenvalue)
-
-        n_frames = fit_params.get('n_frames', 0)
-        if self.lag_time <= 0:
-            covariance_matrix = np.dot(
-                projection[:, np.newaxis].T,
-                projection[:, np.newaxis]
-            ) / (n_frames - 1)
-        else:
-            covariance_matrix = np.dot(
-                projection[:-self.lag_time, np.newaxis].T,
-                projection[self.lag_time:, np.newaxis]
-            ) / (n_frames - self.lag_time - 1)
-        covariance_matrix = ensure_matrix_symmetry(covariance_matrix)
-
-        eigenvalue2, eigenvector2 = np.linalg.eig(covariance_matrix)
-        eigenvalue_order = np.argsort(eigenvalue2)[::-1]
-        eigenvector2 = eigenvector2[:, eigenvalue_order]
-        eigenvalue2 = eigenvalue2[eigenvalue_order]
-        projection = np.dot(
-            projection,
-            eigenvector2[:, :self.n_components]
-        ) * np.sqrt(np.real(eigenvalue2[:self.n_components]))
-        data_distance = spatial.distance_matrix(projection, projection)
-
-        return self.model.fit_transform(data_distance)
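The time-lagged variant ends by handing a precomputed distance matrix to t-SNE; in current scikit-learn that path requires metric="precomputed" together with init="random". A minimal sketch of that final step (the random projection is a stand-in for the time-lagged projection computed above):

    import numpy as np
    from scipy import spatial
    from sklearn.manifold import TSNE

    projection = np.random.rand(60, 5)       # stand-in for the time-lagged projection
    distances = spatial.distance_matrix(projection, projection)
    embedding = TSNE(n_components=2, metric="precomputed", init="random",
                     perplexity=7.0).fit_transform(distances)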
main.py CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
 from datetime import datetime as dt
 from gedi.generator import GenerateEventLogs
 from gedi.features import EventLogFeatures
-from gedi.analyser import FeatureAnalyser
 from gedi.augmentation import InstanceAugmentator
 from gedi.benchmark import BenchmarkTest
 from gedi.plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter