Andrea Maldonado committed
Commit: 3735e7d
1 Parent(s): 3c2100c

Remove unused
Browse files:
- config_files/algorithm/fix_24.json +0 -34
- dashboard.py +0 -295
- gedi/__init__.py +0 -1
- gedi/analyser.py +0 -123
- gedi/utils/algorithms/__init__.py +0 -67
- gedi/utils/algorithms/tsne.py +0 -69
- main.py +0 -1
config_files/algorithm/fix_24.json
DELETED
@@ -1,34 +0,0 @@
-[
-    {
-        "pipeline_step": "event_logs_generation",
-        "output_path":"data/generated",
-        "generator_params": {
-            "objectives": {
-                "normalized_sequence_entropy_linear_forgetting": 0.05,
-                "ratio_top_20_variants": 0.4
-            },
-            "config_space": {
-                "mode": [5, 40],
-                "sequence": [0.01, 1],
-                "choice": [0.01, 1],
-                "parallel": [0.01, 1],
-                "loop": [0.01, 1],
-                "silent": [0.01, 1],
-                "lt_dependency": [0.01, 1],
-                "num_traces": [100, 1001],
-                "duplicate": [0],
-                "or": [0]
-            },
-            "n_trials": 20
-        }
-    },
-    {
-        "pipeline_step": "feature_extraction",
-        "input_path": "data/generated",
-        "feature_params": {"feature_set":["simple_stats", "trace_length", "trace_variant", "activities", "start_activities", "end_activities", "entropies", "complexity"]},
-        "feature_params": {"feature_set":["trace_length"]},
-        "output_path": "output/plots",
-        "real_eventlog_path": "data/log_meta_features.csv",
-        "plot_type": "boxplot"
-    }
-]
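Note that the deleted config declared "feature_params" twice in the second pipeline step. Python's json module keeps only the last occurrence of a duplicate key, so the longer feature set on the first line was silently ignored. A minimal standalone check (illustrative snippet, not part of the repo):

    import json

    # Duplicate keys are tolerated in JSON text, but json.loads keeps only the last one.
    raw = '{"feature_params": {"feature_set": ["simple_stats"]}, "feature_params": {"feature_set": ["trace_length"]}}'
    print(json.loads(raw))  # {'feature_params': {'feature_set': ['trace_length']}}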
dashboard.py
DELETED
@@ -1,295 +0,0 @@
-from copy import deepcopy
-from meta_feature_extraction.simple_stats import simple_stats
-from meta_feature_extraction.trace_length import trace_length
-from meta_feature_extraction.trace_variant import trace_variant
-from meta_feature_extraction.activities import activities
-from meta_feature_extraction.start_activities import start_activities
-from meta_feature_extraction.end_activities import end_activities
-from meta_feature_extraction.entropies import entropies
-from pm4py import discover_petri_net_inductive as inductive_miner
-from pm4py import generate_process_tree
-from pm4py import save_vis_petri_net, save_vis_process_tree
-from pm4py.algo.filtering.log.variants import variants_filter
-from pm4py.algo.simulation.tree_generator import algorithm as tree_generator
-from pm4py.algo.simulation.playout.process_tree import algorithm as playout
-from pm4py.objects.conversion.log import converter as log_converter
-from pm4py.objects.log.exporter.xes import exporter as xes_exporter
-from pm4py.objects.log.importer.xes import importer as xes_importer
-from pm4py.objects.log.util import dataframe_utils
-from pm4py.sim import play_out
-
-import matplotlib.image as mpimg
-import os
-import pandas as pd
-import streamlit as st
-
-OUTPUT_PATH = "output"
-SAMPLE_EVENTS = 500
-
-@st.cache(allow_output_mutation=True)
-def load_from_xes(uploaded_file):
-    bytes_data = uploaded_file.getvalue()
-    log1 = xes_importer.deserialize(bytes_data)
-    get_stats(log1)
-    return log1
-
-@st.cache
-def load_from_csv(uploaded_file, sep):
-    if uploaded_file is not None:
-        df = pd.read_csv(uploaded_file, sep=sep, index_col=False)
-        return df
-
-def get_stats(log, save=True):
-    """Returns the statistics of an event log."""
-    num_traces = len(log)
-    num_events = sum([len(c) for c in log])
-    num_utraces = len(variants_filter.get_variants(log))
-    if save:
-        st.session_state["num_traces"] = num_traces
-        st.session_state["num_events"] = num_events
-        st.session_state["num_utraces"] = num_utraces
-    return num_utraces, num_traces, num_events
-
-#@st.cache
-def df_to_log(df, case_id, activity, timestamp):
-    df.rename(columns={case_id: 'case:concept:name',
-                       activity: 'concept:name',
-                       timestamp: "time:timestamp"}, inplace=True)
-    temp = dataframe_utils.convert_timestamp_columns_in_df(df)
-    #temp = temp.sort_values(timestamp)
-    log = log_converter.apply(temp)
-    return log, 'concept:name', "time:timestamp"
-
-def read_uploaded_file(uploaded_file):
-    extension = uploaded_file.name.split('.')[-1]
-    log_name = uploaded_file.name.split('.')[-2]
-
-    st.sidebar.write("Loaded ", extension.upper(), '-File: ', uploaded_file.name)
-    if extension == "xes":
-        event_log = load_from_xes(uploaded_file)
-        log_columns = [*list(event_log[0][0].keys())]
-        convert_button = False
-        case_id = "case:concept:name"
-        activity = "concept:name"
-        timestamp = "time:timestamp"
-        default_act_id = log_columns.index("concept:name")
-        default_tst_id = log_columns.index("time:timestamp")
-
-        event_df = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)
-        df_path = OUTPUT_PATH+"/"+log_name+".csv"
-        event_df.to_csv(df_path, sep =";", index=False)
-        return event_log, event_df, case_id, activity
-
-    elif extension == "csv":
-        sep = st.sidebar.text_input("Columns separator", ";")
-        event_df = load_from_csv(uploaded_file, sep)
-        old_df = deepcopy(event_df)
-        log_columns = event_df.columns
-
-        case_id = st.sidebar.selectbox("Choose 'case' column:", log_columns)
-        activity = st.sidebar.selectbox("Choose 'activity' column:", log_columns, index=0)
-        timestamp = st.sidebar.selectbox("Choose 'timestamp' column:", log_columns, index=0)
-
-        convert_button = st.sidebar.button('Confirm selection')
-        if convert_button:
-            temp = deepcopy(event_df)
-            event_log, activity, timestamp = df_to_log(temp, case_id, activity, timestamp)
-            #xes_exporter.apply(event_log, INPUT_XES)
-            log_columns = [*list(event_log[0][0].keys())]
-            st.session_state['log'] = event_log
-            return event_log, event_df, case_id, activity
-
-def sample_log_traces(complete_log, sample_size):
-    '''
-    Samples random traces out of logs.
-    So that number of events is slightly over SAMPLE_SIZE.
-    :param complete_log: Log extracted from xes
-    '''
-
-    log_traces = variants_filter.get_variants(complete_log)
-    keys = list(log_traces.keys())
-    sample_traces = {}
-    num_evs = 0
-    while num_evs < sample_size:
-        if len(keys) == 0:
-            break
-        random_trace = keys.pop()
-        sample_traces[random_trace] = log_traces[random_trace]
-        evs = sum([len(case_id) for case_id in sample_traces[random_trace]])
-        num_evs += evs
-    log1 = variants_filter.apply(complete_log, sample_traces)
-    return log1
-
-def show_process_petrinet(event_log, filter_info, OUTPUT_PATH):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}_{filter_info}".replace(":","").replace(".","")+".png" # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    try:
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-        st.write("Loaded from memory")
-    except FileNotFoundError:
-        net, im, fm = inductive_miner(event_log)
-        # parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: 0.99,
-        # pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"})
-        #parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
-        save_vis_petri_net(net, im, fm, OUTPUT_PLOT)
-        st.write("Saved in: ", OUTPUT_PLOT)
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-    st.image(fig_pt)
-
-def show_loaded_event_log(event_log, event_df):
-    get_stats(event_log)
-    st.write("### Loaded event-log")
-    col1, col2 = st.columns(2)
-    with col2:
-        st.dataframe(event_df)
-    with col1:
-        show_process_petrinet(event_log, None, OUTPUT_PATH+"running-example")
-
-def extract_meta_features(log, log_name):
-    mtf_cols = ["log", "n_traces", "n_unique_traces", "ratio_unique_traces_per_trace", "n_events", "trace_len_min", "trace_len_max",
-                "trace_len_mean", "trace_len_median", "trace_len_mode", "trace_len_std", "trace_len_variance", "trace_len_q1",
-                "trace_len_q3", "trace_len_iqr", "trace_len_geometric_mean", "trace_len_geometric_std", "trace_len_harmonic_mean",
-                "trace_len_skewness", "trace_len_kurtosis", "trace_len_coefficient_variation", "trace_len_entropy", "trace_len_hist1",
-                "trace_len_hist2", "trace_len_hist3", "trace_len_hist4", "trace_len_hist5", "trace_len_hist6", "trace_len_hist7",
-                "trace_len_hist8", "trace_len_hist9", "trace_len_hist10", "trace_len_skewness_hist", "trace_len_kurtosis_hist",
-                "ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants", "ratio_top_10_variants", "ratio_top_20_variants",
-                "ratio_top_50_variants", "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence", "skewness_variant_occurrence",
-                "kurtosis_variant_occurrence", "n_unique_activities", "activities_min", "activities_max", "activities_mean", "activities_median",
-                "activities_std", "activities_variance", "activities_q1", "activities_q3", "activities_iqr", "activities_skewness",
-                "activities_kurtosis", "n_unique_start_activities", "start_activities_min", "start_activities_max", "start_activities_mean",
-                "start_activities_median", "start_activities_std", "start_activities_variance", "start_activities_q1", "start_activities_q3",
-                "start_activities_iqr", "start_activities_skewness", "start_activities_kurtosis", "n_unique_end_activities", "end_activities_min",
-                "end_activities_max", "end_activities_mean", "end_activities_median", "end_activities_std", "end_activities_variance",
-                "end_activities_q1", "end_activities_q3", "end_activities_iqr", "end_activities_skewness", "end_activities_kurtosis", "entropy_trace",
-                "entropy_prefix", "entropy_global_block", "entropy_lempel_ziv", "entropy_k_block_diff_1", "entropy_k_block_diff_3",
-                "entropy_k_block_diff_5", "entropy_k_block_ratio_1", "entropy_k_block_ratio_3", "entropy_k_block_ratio_5", "entropy_knn_3",
-                "entropy_knn_5", "entropy_knn_7"]
-    features = [log_name]
-    features.extend(simple_stats(log))
-    features.extend(trace_length(log))
-    features.extend(trace_variant(log))
-    features.extend(activities(log))
-    features.extend(start_activities(log))
-    features.extend(end_activities(log))
-    features.extend(entropies(log_name, OUTPUT_PATH))
-
-    mtf = pd.DataFrame([features], columns=mtf_cols)
-
-    st.dataframe(mtf)
-    return mtf
-
-def generate_pt(mtf):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}/generated_pt".replace(":","").replace(".","")#+".png" # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    st.write("### PT Gen configurations")
-    col1, col2, col3, col4, col5, col6 = st.columns(6)
-    with col1:
-        param_mode = st.text_input('Mode', str(round(mtf['activities_median'].iat[0]))) #?
-        st.write("Sum of probabilities must be one")
-    with col2:
-        param_min = st.text_input('Min', str(mtf['activities_min'].iat[0]))
-        param_seq = st.text_input('Probability Sequence', 0.25)
-    with col3:
-        param_max = st.text_input('Max', str(mtf['activities_max'].iat[0]))
-        param_cho = st.text_input('Probability Choice (XOR)', 0.25)
-    with col4:
-        param_nmo = st.text_input('Number of models', 1)
-        param_par = st.text_input('Probability Parallel', 0.25)
-    with col5:
-        param_dup = st.text_input('Duplicates', 0)
-        param_lop = st.text_input('Probability Loop', 0.25)
-    with col6:
-        param_sil = st.text_input('Silent', 0.2)
-        param_or = st.text_input('Probability Or', 0.0)
-
-    PT_PARAMS = {tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MODE: round(float(param_mode)), #most frequent number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MIN: int(param_min), #minimum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MAX: int(param_max), #maximum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SEQUENCE: float(param_seq), #probability to add a sequence operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.CHOICE: float(param_cho), #probability to add a choice (XOR) operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.PARALLEL: float(param_par), #probability to add a parallel operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.LOOP: float(param_lop), #probability to add a loop operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.OR: float(param_or), #probability to add an or operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SILENT: float(param_sil), #probability to add silent activity to a choice or loop operator
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.DUPLICATE: int(param_dup), #probability to duplicate an activity label
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.NO_MODELS: int(param_nmo)} #number of trees to generate from model population
-
-    process_tree = generate_process_tree(parameters=PT_PARAMS)
-    save_vis_process_tree(process_tree, OUTPUT_PLOT+"_tree.png")
-
-    st.write("### Playout configurations")
-
-    param_ntraces = st.text_input('Number of traces', str(mtf['n_traces'].iat[0]))
-    PO_PARAMS = {playout.Variants.BASIC_PLAYOUT.value.Parameters.NO_TRACES : int(param_ntraces)}
-
-    ptgen_log = play_out(process_tree, parameters=PO_PARAMS)
-
-    net, im, fm = inductive_miner(ptgen_log)
-    save_vis_petri_net(net, im, fm, OUTPUT_PLOT+".png")
-    st.write("Saved in: ", OUTPUT_PLOT)
-    fig_pt_net = mpimg.imread(OUTPUT_PLOT+".png")
-    fig_pt_tree = mpimg.imread(OUTPUT_PLOT+"_tree.png")
-
-    fcol1, fcol2 = st.columns(2)
-    with fcol1:
-        st.image(fig_pt_tree)
-    with fcol2:
-        st.image(fig_pt_net)
-    extract_meta_features(ptgen_log, "gen_pt")
-
-
-if __name__ == '__main__':
-    st.set_page_config(layout='wide')
-    """
-    # Event Log Generator
-    """
-    start_options = ['Event-Log', 'Meta-features']
-    start_preference = st.sidebar.selectbox("Do you want to start with a log or with metafeatures?", start_options,0)
-    #lets_start = st.sidebar.button("Let's start with "+start_preference+'!')
-
-    if start_preference==start_options[0]:
-        st.sidebar.write("Upload a dataset in csv or xes-format:")
-        uploaded_file = st.sidebar.file_uploader("Pick a logfile")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state['log'] if "log" in st.session_state else None
-        if uploaded_file:
-            event_log, event_df, case_id, activity_id = read_uploaded_file(uploaded_file)
-            #event_log = deepcopy(event_log)
-
-            use_sample = st.sidebar.checkbox('Use random sample', True)
-            if use_sample:
-                sample_size = st.sidebar.text_input('Sample size of approx number of events', str(SAMPLE_EVENTS))
-                sample_size = int(sample_size)
-
-                event_log = sample_log_traces(event_log, sample_size)
-                sample_cases = [event_log[i].attributes['concept:name'] for i in range(0, len(event_log))]
-                event_df = event_df[event_df[case_id].isin(sample_cases)]
-
-            show_loaded_event_log(event_log, event_df)
-            ext_mtf = extract_meta_features(event_log, "running-example")
-            generate_pt(ext_mtf)
-
-    elif start_preference==start_options[1]:
-        LOG_COL = 'log'
-        st.sidebar.write("Upload a dataset in csv-format")
-        uploaded_file = st.sidebar.file_uploader("Pick a file containing meta-features")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state[LOG_COL] if "log" in st.session_state else None
-        if uploaded_file:
-            sep = st.sidebar.text_input("Columns separator", ";")
-            mtf = load_from_csv(uploaded_file, sep)
-            st.dataframe(mtf)
-
-            log_options = mtf['log'].unique()
-            log_preference = st.selectbox("What log should we use for generating a new event-log?", log_options,1)
-            mtf_selection = mtf[mtf[LOG_COL]==log_preference]
-            generate_pt(mtf_selection)
-            st.write("##### Original")
-            st.write(mtf_selection)
-
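For reference, the deleted dashboard cached its loaders with the legacy @st.cache decorator, which newer Streamlit releases have replaced with st.cache_data / st.cache_resource. A minimal sketch of the CSV loader in the current API (same signature assumed, not code from this repo):

    import pandas as pd
    import streamlit as st

    @st.cache_data  # successor to the legacy @st.cache used in the deleted file
    def load_from_csv(uploaded_file, sep):
        # Re-read the upload only when the file or the separator changes.
        if uploaded_file is not None:
            return pd.read_csv(uploaded_file, sep=sep, index_col=False)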
gedi/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from .generator import GenerateEventLogs
 from .features import EventLogFeatures
-from .analyser import FeatureAnalyser
 from .augmentation import InstanceAugmentator
 from .benchmark import BenchmarkTest
 from .plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter
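Since the package no longer re-exports the class, any downstream import of it now fails (illustrative snippet, not part of the repo):

    try:
        from gedi import FeatureAnalyser  # worked before this commit via gedi/__init__.py
    except ImportError:
        # Fails after this commit: the re-export and gedi/analyser.py are both gone.
        FeatureAnalyser = None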
gedi/analyser.py
DELETED
@@ -1,123 +0,0 @@
-import numpy as np
-import warnings
-
-from sklearn.decomposition import FastICA, PCA
-from sklearn.manifold import TSNE
-from sklearn.preprocessing import Normalizer, StandardScaler
-from gedi.features import EventLogFeatures
-from gedi.plotter import ModelResultPlotter
-from gedi.utils.matrix_tools import insert_missing_data
-# TODO: Call param_keys explicitly e.g. import INPUT_PATH
-from utils.param_keys import *
-from utils.param_keys.analyser import MODEL, INPUT_PARAMS, PERPLEXITY
-
-
-# FUDO: Use this class to compare models during evaluation
-class FeatureAnalyser:
-    def __init__(self, features, params=None):
-        self.features: EventLogFeatures = features
-        self.params: dict = {
-            PLOT_TYPE: params.get(PLOT_TYPE, COLOR_MAP),
-            PLOT_TICS: params.get(PLOT_TICS, True),
-            INTERACTIVE: params.get(INTERACTIVE, True),
-            N_COMPONENTS: params.get(N_COMPONENTS, 2),
-            PERPLEXITY: params.get(PERPLEXITY, 3)
-        }
-    def compare(self, model_parameter_list: list[dict], plot_results: bool = True) -> list[dict]:
-        """
-        :param model_parameter_list: list[dict]
-            Different model input parameters, saved in a list
-        :param plot_results: bool
-            Plots the components of the different models (default: True)
-            The function can be calculated
-        :return: list[dict]
-            The results of the models {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model_results = []
-        for model_parameters in model_parameter_list:
-            try:
-                model_results.append(self.get_model_result(model_parameters))
-            except np.linalg.LinAlgError as e:
-                warnings.warn(f'Eigenvalue decomposition for model `{model_parameters}` could not be calculated:\n {e}')
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-
-        if plot_results:
-            self.compare_with_plot(model_results)
-
-        return model_results
-
-    def compare_with_plot(self, model_results_list):
-        """
-        This method is used to compare the results in a plot, after fit_transforming different models.
-        @param model_results_list: list[dict]
-            Different model input parameters, saved in a list.
-        """
-        ModelResultPlotter().plot_models(
-            model_results_list,
-            plot_type=self.params[PLOT_TYPE],
-            plot_tics=self.params[PLOT_TICS],
-            components=self.params[N_COMPONENTS]
-        )
-
-    def get_model_result(self, model_parameters: dict, log: bool = True) -> dict:
-        """
-        Returns a dict of all the important result values. Used for analysing the different models
-        :param model_parameters: dict
-            The input parameters for the model
-        :param log: bool
-            Enables the log output while running the program (default: True)
-        :return: dict of the results: {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model, projection = self.get_model_and_projection(model_parameters, log=log)
-        try:
-            ex_var = model.explained_variance_ratio_
-        except AttributeError as e:
-            warnings.warn(str(e))
-            ex_var = 0
-        return {MODEL: model, PROJECTION: projection, EXPLAINED_VAR: ex_var, INPUT_PARAMS: model_parameters}
-
-    def get_model_and_projection(self, model_parameters: dict, inp: np.ndarray = None, log: bool = True):
-        """
-        This method is fitting a model with the given parameters :model_parameters: and
-        the inp(ut) data is transformed on the model.
-        @param model_parameters: dict
-            The input parameters for the model.
-        @param inp: np.ndarray
-            Input data for the model (optional), (default: None -> calculated on the basis of the model_parameters)
-        @param log: bool
-            Enables the log output while running the program (default: True)
-        @return: fitted model and transformed data
-        """
-        if log:
-            print(f'Running {model_parameters}...')
-
-        if inp is None:
-            inp = insert_missing_data(self.features.feat)
-
-        if ALGORITHM_NAME not in model_parameters.keys():
-            raise KeyError(f'{ALGORITHM_NAME} is a mandatory model parameter.')
-
-        if model_parameters[ALGORITHM_NAME].startswith('normalized'):
-            inp = Normalizer(norm="l2").fit_transform(inp)
-        elif model_parameters[ALGORITHM_NAME].startswith('std_scaled'):
-            scaler = StandardScaler()
-            inp = scaler.fit_transform(inp)
-        try:
-            if 'pca' in model_parameters[ALGORITHM_NAME]:
-                # from sklearn.decomposition import PCA
-                pca = PCA(n_components=self.params[N_COMPONENTS])
-                # pca = coor.pca(data=inp, dim=self.params[N_COMPONENTS])
-                return pca, pca.fit_transform(inp)
-            elif 'tsne' in model_parameters[ALGORITHM_NAME]:
-                tsne = TSNE(n_components=self.params[N_COMPONENTS], learning_rate='auto',
-                            init='random', perplexity=self.params[PERPLEXITY])
-                return tsne, tsne.fit_transform(inp)
-            #elif model_parameters[ALGORITHM_NAME] == 'original_ica':
-            #    ica = FastICA(n_components=self.params[N_COMPONENTS])
-            #    return ica, ica.fit_transform(inp)
-            else:
-                warnings.warn(f'No original algorithm was found with name: {model_parameters[ALGORITHM_NAME]}')
-        except TypeError:
-            raise TypeError(f'Input data of the function is not correct. '
-                            f'Original algorithms take only 2-n-dimensional ndarray')
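The deleted FeatureAnalyser was essentially a thin wrapper around scikit-learn preprocessing plus PCA/t-SNE. A minimal standalone sketch of the same flow, with X standing in for the feature matrix the class pulled from EventLogFeatures (assumed data, not repo code):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import Normalizer

    X = np.random.rand(20, 8)                        # stand-in feature matrix
    X_norm = Normalizer(norm="l2").fit_transform(X)  # the 'normalized' branch above

    pca = PCA(n_components=2)                        # exposes explained_variance_ratio_
    pca_projection = pca.fit_transform(X_norm)

    tsne = TSNE(n_components=2, learning_rate="auto", init="random", perplexity=3)
    tsne_projection = tsne.fit_transform(X_norm)     # t-SNE reports no explained variance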
gedi/utils/algorithms/__init__.py
DELETED
@@ -1,67 +0,0 @@
-import numpy as np
-from sklearn.base import TransformerMixin, BaseEstimator
-
-from utils.param_keys import N_COMPONENTS
-
-
-class MyModel(TransformerMixin, BaseEstimator):
-    """
-    This class and some child classes are partly copied from:
-    https://towardsdatascience.com/implementing-pca-from-scratch-fb434f1acbaa
-    and commented with the help of:
-    https://www.askpython.com/python/examples/principal-component-analysis
-    """
-    def __init__(self):
-        self.explained_variance_ = None
-        self.components_ = None
-        self._standardized_data = None
-        self.n_components = None
-        self.n_samples = None
-        self._covariance_matrix = None
-
-    def __str__(self):
-        return f'{self.__class__.__name__}:\ncomponents={self.n_components}'
-
-    def fit_transform(self, data_ndarray, **fit_params):
-        self.fit(data_ndarray, **fit_params)
-        return self.transform(data_ndarray)
-
-    def fit(self, data_matrix, **fit_params):
-        self.n_samples = data_matrix.shape[0]
-        self.n_components = fit_params.get(N_COMPONENTS, 2)
-        self._standardized_data = self._standardize_data(data_matrix)
-        self._covariance_matrix = self.get_covariance_matrix()
-        self.components_ = self.get_eigenvectors()
-        return self
-
-    @staticmethod
-    def _standardize_data(matrix):
-        """
-        Subtract mean and divide by standard deviation column-wise.
-        Doing this proves to be very helpful when calculating the covariance matrix.
-        https://towardsdatascience.com/understanding-the-covariance-matrix-92076554ea44
-        Mean-Center the data
-        :param matrix: Data as matrix
-        :return: Standardized data matrix
-        """
-        numerator = matrix - np.mean(matrix, axis=0)
-        denominator = np.std(matrix, axis=0)
-        return numerator / denominator
-
-    def get_covariance_matrix(self):
-        """
-        Calculate covariance matrix with standardized matrix A
-        :return: Covariance Matrix
-        """
-        return np.cov(self._standardized_data.T)
-
-    def get_eigenvectors(self):
-        pass
-
-    def transform(self, data_matrix):
-        """
-        Project the data to the lower dimension with the help of the eigenvectors.
-        :return: Data reduced to lower dimensions from higher dimensions
-        """
-        data_matrix_standardized = self._standardize_data(data_matrix)
-        return np.dot(data_matrix_standardized, self.components_[:, :self.n_components])
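MyModel leaves get_eigenvectors unimplemented; a PCA-style child class would complete it by eigendecomposing the covariance matrix and sorting the components by explained variance, roughly as follows (a sketch under that assumption, not code from this repo):

    import numpy as np

    def get_eigenvectors(covariance_matrix):
        # eigh suits the symmetric covariance matrix; sort columns by
        # descending eigenvalue so the leading components come first.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
        order = np.argsort(eigenvalues)[::-1]
        return eigenvectors[:, order]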
gedi/utils/algorithms/tsne.py
DELETED
@@ -1,69 +0,0 @@
-from scipy import spatial
-
-from tag.utils.algorithms import MyModel
-import sklearn.manifold as sk
-import numpy as np
-
-from tag.utils.matrix_tools import ensure_matrix_symmetry
-
-"""
-Parts of this file were originally copied from the tltsne python module.
-https://github.com/spiwokv/tltsne/blob/master/tltsne/__init__.py
-Since the results in the text file are complicated to reuse, this module was modified somewhat.
-This way, the results of the models can be used and it's Object Oriented.
-"""
-
-
-class MyTSNE(MyModel):
-    def __init__(self, n_components, perplexity=7.0,
-                 early_exaggeration=12.0, learning_rate="auto",
-                 n_iter=1000, metric="euclidean"):
-        super().__init__()
-        self.model = sk.TSNE(
-            n_components=n_components, perplexity=perplexity,
-            early_exaggeration=early_exaggeration, learning_rate=learning_rate,
-            n_iter=n_iter, metric=metric
-        )
-
-    def fit_transform(self, data_matrix, **fit_params):
-        return self.model.fit_transform(data_matrix, **fit_params)
-
-
-class MyTimeLaggedTSNE(MyTSNE):
-    def __init__(self, lag_time, **kwargs):
-        super().__init__(metric="precomputed", **kwargs)
-        self.lag_time = lag_time
-
-    def fit_transform(self, data_matrix, **fit_params):
-        data_zero_mean = data_matrix - np.mean(data_matrix, axis=0)
-        cov = np.cov(data_zero_mean.T)
-        eigenvalue, eigenvector = np.linalg.eig(cov)
-        eigenvalue_order = np.argsort(eigenvalue)[::-1]
-        eigenvector = eigenvector[:, eigenvalue_order]
-        eigenvalue = eigenvalue[eigenvalue_order]
-        projection = data_zero_mean.dot(eigenvector) / np.sqrt(eigenvalue)
-
-        n_frames = fit_params.get('n_frames', 0)
-        if self.lag_time <= 0:
-            covariance_matrix = np.dot(
-                projection[:, np.newaxis].T,
-                projection[:, np.newaxis]
-            ) / (n_frames - 1)
-        else:
-            covariance_matrix = np.dot(
-                projection[:-self.lag_time, np.newaxis].T,
-                projection[self.lag_time:, np.newaxis]
-            ) / (n_frames - self.lag_time - 1)
-        covariance_matrix = ensure_matrix_symmetry(covariance_matrix)
-
-        eigenvalue2, eigenvector2 = np.linalg.eig(covariance_matrix)
-        eigenvalue_order = np.argsort(eigenvalue2)[::-1]
-        eigenvector2 = eigenvector2[:, eigenvalue_order]
-        eigenvalue2 = eigenvalue2[eigenvalue_order]
-        projection = np.dot(
-            projection,
-            eigenvector2[:, :self.n_components]
-        ) * np.sqrt(np.real(eigenvalue2[:self.n_components]))
-        data_distance = spatial.distance_matrix(projection, projection)
-
-        return self.model.fit_transform(data_distance)
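MyTimeLaggedTSNE whitens the input, builds a time-lagged covariance, projects onto its leading eigenvectors, and finally hands a pairwise distance matrix to t-SNE. With metric="precomputed", scikit-learn's TSNE expects exactly such a square distance matrix instead of raw features; a minimal illustration of that contract (assumed data, not repo code):

    import numpy as np
    from scipy import spatial
    from sklearn.manifold import TSNE

    points = np.random.rand(30, 4)
    distances = spatial.distance_matrix(points, points)  # square, symmetric, zero diagonal

    # init="random" is required: PCA initialization is unavailable for precomputed distances.
    tsne = TSNE(n_components=2, metric="precomputed", init="random", perplexity=5)
    embedding = tsne.fit_transform(distances)            # operates on the distances directly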
main.py
CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
 from datetime import datetime as dt
 from gedi.generator import GenerateEventLogs
 from gedi.features import EventLogFeatures
-from gedi.analyser import FeatureAnalyser
 from gedi.augmentation import InstanceAugmentator
 from gedi.benchmark import BenchmarkTest
 from gedi.plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter