Andrea Maldonado committed on
Commit
441ee01
·
1 Parent(s): 761e409

GEDI Fig.7 and 8

Browse files
data/{baseline_ED_bench.csv → BaselineED_bench.csv} RENAMED
@@ -1,4 +1,4 @@
1
- log,fitness_heuristics,precision_heuristics,fscore_heuristics,size_heuristics,pnsize_heuristics,cfc_heuristics,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  BPIC16wm_p,0.999900004026629,1.0,0.999949999513391,5.0,4.0,2.0,0.9999495832135112,1.0,0.999974790971276,4.0,3.0,1.0,0.999900004026629,1.0,0.999949999513391,5,4,2
3
  BPIC13op,0.990133346397138,0.9620563035495712,0.975892918274616,12.0,7.0,7.0,0.99993033237412,0.9065645824471852,0.950961282086593,10.0,5.0,3.0,0.8513195049834781,0.9065645824471852,0.8780739493381781,17,10,8
4
  BPIC13cp,0.989977119234364,0.8684298767708941,0.925228660364203,14.0,9.0,8.0,0.999955347339294,0.792379879879879,0.8841476594077591,20.0,8.0,6.0,0.990412853232678,0.9470205909661912,0.9682307987170752,15,10,9
 
1
+ log,fitness_heu,precision_heu,fscore_heu,size_heu,pnsize_heu,cfc_heu,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  BPIC16wm_p,0.999900004026629,1.0,0.999949999513391,5.0,4.0,2.0,0.9999495832135112,1.0,0.999974790971276,4.0,3.0,1.0,0.999900004026629,1.0,0.999949999513391,5,4,2
3
  BPIC13op,0.990133346397138,0.9620563035495712,0.975892918274616,12.0,7.0,7.0,0.99993033237412,0.9065645824471852,0.950961282086593,10.0,5.0,3.0,0.8513195049834781,0.9065645824471852,0.8780739493381781,17,10,8
4
  BPIC13cp,0.989977119234364,0.8684298767708941,0.925228660364203,14.0,9.0,8.0,0.999955347339294,0.792379879879879,0.8841476594077591,20.0,8.0,6.0,0.990412853232678,0.9470205909661912,0.9682307987170752,15,10,9
data/{baseline_ED_feat.csv → BaselineED_feat.csv} RENAMED
File without changes
data/{GenBaseline_ED_bench.csv → GenBaselineED_bench.csv} RENAMED
@@ -1,4 +1,4 @@
1
- log,fitness_heuristics,precision_heuristics,fscore_heuristics,size_heuristics,pnsize_heuristics,cfc_heuristics,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  genELBPIC20b_03394_01938_01456_07583_02123_08113_01168,0.6965863019071621,0.8709677419354831,0.7740775519905101,13.0,7.0,5.0,0.999969621176065,0.427355623100303,0.598802049151352,21.0,7.0,6.0,0.99991994157317,0.902439024390243,0.9486819182778732,21,14,11
3
  genELBPIC15f1_06103_03639_02702_06529_00067_01218_09758,0.244571491396844,0.970825492684492,0.390713884832271,48.0,28.0,13.0,0.9999851056034972,0.7639844601581931,0.866197576079873,50.0,34.0,12.0,0.9999702116506732,0.7639844601581931,0.8661919884129461,32,33,4
4
  genELBPIC12_04231_02756_02261_07083_0262_06863_03336,0.938048056994855,0.492925487219797,0.6462562461747551,49.0,30.0,30.0,0.999983354100017,0.128493715326455,0.227725631143263,54.0,10.0,28.0,0.9099497610012792,0.397165646466794,0.552974562177985,48,30,26
 
1
+ log,fitness_heu,precision_heu,fscore_heu,size_heu,pnsize_heu,cfc_heu,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  genELBPIC20b_03394_01938_01456_07583_02123_08113_01168,0.6965863019071621,0.8709677419354831,0.7740775519905101,13.0,7.0,5.0,0.999969621176065,0.427355623100303,0.598802049151352,21.0,7.0,6.0,0.99991994157317,0.902439024390243,0.9486819182778732,21,14,11
3
  genELBPIC15f1_06103_03639_02702_06529_00067_01218_09758,0.244571491396844,0.970825492684492,0.390713884832271,48.0,28.0,13.0,0.9999851056034972,0.7639844601581931,0.866197576079873,50.0,34.0,12.0,0.9999702116506732,0.7639844601581931,0.8661919884129461,32,33,4
4
  genELBPIC12_04231_02756_02261_07083_0262_06863_03336,0.938048056994855,0.492925487219797,0.6462562461747551,49.0,30.0,30.0,0.999983354100017,0.128493715326455,0.227725631143263,54.0,10.0,28.0,0.9099497610012792,0.397165646466794,0.552974562177985,48,30,26
data/{GenBaseline_ED_feat.csv → GenBaselineED_feat.csv} RENAMED
File without changes
data/GenED_bench.csv CHANGED
@@ -1,4 +1,4 @@
1
- log,fitness_heuristics,precision_heuristics,fscore_heuristics,size_heuristics,pnsize_heuristics,cfc_heuristics,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  2_ense_rmcv_genELtask_67_06_00,0.376214776532216,0.994733180959952,0.545948253307299,29.0,18.0,10.0,,,,,,,0.945685191537984,0.507638900441974,0.6606462982975451,28.0,22.0,8.0
3
  2_enself_rmcv_genELtask_13_01_01,0.63263614857424,0.858184089962515,0.7283484130513961,14.0,8.0,7.0,0.99997771174738,0.940229218047294,0.96918349021716,13.0,8.0,3.0,0.95097054618107,0.940229218047294,0.945569378691894,13.0,8.0,4.0
4
  2_rt10v_rutpt_genELtask_1_00_00,0.538653366583541,1.0,0.700162074554294,5.0,4.0,0.0,0.999955489786125,1.0,0.9999777443977612,15.0,8.0,4.0,0.999932932799884,1.0,0.999966465275402,11.0,10.0,2.0
 
1
+ log,fitness_heu,precision_heu,fscore_heu,size_heu,pnsize_heu,cfc_heu,fitness_ilp,precision_ilp,fscore_ilp,size_ilp,pnsize_ilp,cfc_ilp,fitness_imf,precision_imf,fscore_imf,size_imf,pnsize_imf,cfc_imf
2
  2_ense_rmcv_genELtask_67_06_00,0.376214776532216,0.994733180959952,0.545948253307299,29.0,18.0,10.0,,,,,,,0.945685191537984,0.507638900441974,0.6606462982975451,28.0,22.0,8.0
3
  2_enself_rmcv_genELtask_13_01_01,0.63263614857424,0.858184089962515,0.7283484130513961,14.0,8.0,7.0,0.99997771174738,0.940229218047294,0.96918349021716,13.0,8.0,3.0,0.95097054618107,0.940229218047294,0.945569378691894,13.0,8.0,4.0
4
  2_rt10v_rutpt_genELtask_1_00_00,0.538653366583541,1.0,0.700162074554294,5.0,4.0,0.0,0.999955489786125,1.0,0.9999777443977612,15.0,8.0,4.0,0.999932932799884,1.0,0.999966465275402,11.0,10.0,2.0
notebooks/GEDI_statistical_tests.ipynb ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 32,
6
+ "id": "1768477d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "from scipy import spatial\n",
12
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
13
+ "TEST='kendalltau'\n",
14
+ "DATA_SOURCE = 'BaselineED' #'BaselineED', 'GenBaselineED', 'GenED'\n",
15
+ "IMPUTE = False #If False Nan lines are dropped\n",
16
+ "\n",
17
+ "paper_feat_columns = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
18
+ "paper_metrics_columns = ['log', 'fitness_heu', 'precision_heu',\n",
19
+ " 'fscore_heu', 'size_heu', 'cfc_heu', 'fitness_ilp', 'precision_ilp', 'fscore_ilp',\n",
20
+ " 'size_ilp','cfc_ilp', 'fitness_imf', 'precision_imf', 'fscore_imf', 'size_imf', 'cfc_imf']"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 33,
26
+ "id": "d3b7f2d1",
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "BaselineED\n",
34
+ "kendalltau_BaselineED_nanDropped\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "def get_output_file_name(test, data_source, impute): \n",
40
+ " print(data_source)\n",
41
+ " impute = 'imputed' if impute else 'nanDropped'\n",
42
+ " return (\"_\".join([test, data_source, impute]))\n",
43
+ "print(get_output_file_name(TEST, DATA_SOURCE, IMPUTE))"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 34,
49
+ "id": "6594d6b4",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "## LOAD FEATURE AND METRICS FILES\n",
54
+ "def load_data(data_source, content):\n",
55
+ " path = f\"../data/{data_source}.csv\" \n",
56
+ " print(\"Path: \", path)\n",
57
+ " data = pd.read_csv(path).sort_values('log')\n",
58
+ " if data_source == 'GenBaselineED_feat':\n",
59
+ " data['log']=data.apply(lambda x: \"Gen\"+x['log'], axis=1)\n",
60
+ " elif data_source == 'GenBaselineED_bench':\n",
61
+ " data['log']=data.apply(lambda x: \"Gen\"+x['log'].replace(\"genEL\",\"\").rsplit(\"_\",7)[0], axis=1)\n",
62
+ " return data"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 35,
68
+ "id": "7428d805",
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "### INSTANCE SELECTION: NULLS OR IMPUTATION?\n",
73
+ "import numpy as np\n",
74
+ "import matplotlib.pyplot as plt\n",
75
+ "import seaborn as sns\n",
76
+ "from sklearn.impute import SimpleImputer\n",
77
+ "\n",
78
+ "def clean_data(fd_pdm, impute=False, feat_columns=paper_feat_columns, metric_columns=paper_metrics_columns):\n",
79
+ " num_cols = fd_pdm.convert_dtypes().select_dtypes(exclude=['string']).columns\n",
80
+ " str_cols = fd_pdm.convert_dtypes().select_dtypes(include=['string']).columns\n",
81
+ "\n",
82
+ " imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
83
+ " imputer.fit(fd_pdm.drop(str_cols, axis=1))\n",
84
+ " imp_df = imputer.transform(fd_pdm.drop(str_cols, axis=1))\n",
85
+ " imp_df = pd.DataFrame(imp_df, columns=num_cols)\n",
86
+ " imp_df['log'] = fd_pdm['log']\n",
87
+ " print(\"Imputed dataset:\" ,imp_df.shape)\n",
88
+ "\n",
89
+ " ft_pdm_no_nans = fd_pdm.copy()\n",
90
+ " ft_pdm_no_nans = ft_pdm_no_nans.dropna()\n",
91
+ " ft_pdm_no_nans['log'] = fd_pdm['log']\n",
92
+ " print(\"No nan's dataset:\" ,ft_pdm_no_nans.shape)\n",
93
+ " #print(len(ft_pdm_no_nans[ft_pdm_no_nans['source']==DATA_SOURCE]['log']))\n",
94
+ " print(\"FT_COL: \", feat_columns)\n",
95
+ " print(\"M_COL: \", metric_columns)\n",
96
+ " \n",
97
+ " if IMPUTE:\n",
98
+ " benchmarked_ft = imp_df[feat_columns]\n",
99
+ " benchmarked_pd = imp_df[metric_columns]\n",
100
+ " else:\n",
101
+ " benchmarked_ft = ft_pdm_no_nans[feat_columns]\n",
102
+ " benchmarked_pd = ft_pdm_no_nans[metric_columns]\n",
103
+ " return benchmarked_ft, benchmarked_pd"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 39,
109
+ "id": "14e72f71",
110
+ "metadata": {},
111
+ "outputs": [
112
+ {
113
+ "name": "stdout",
114
+ "output_type": "stream",
115
+ "text": [
116
+ "Path: ../data/BaselineED_feat.csv\n",
117
+ "(26, 8)\n",
118
+ "['BPIC12', 'BPIC13cp', 'BPIC13inc', 'BPIC13op', 'BPIC14dc_p', 'BPIC14di_p', 'BPIC14dia_p', 'BPIC15f1', 'BPIC15f2', 'BPIC15f3', 'BPIC15f4', 'BPIC15f5', 'BPIC16c_p', 'BPIC16wm_p', 'BPIC17', 'BPIC17ol', 'BPIC19', 'BPIC20a', 'BPIC20b', 'BPIC20c', 'BPIC20d', 'BPIC20e', 'HD', 'RTFMP', 'RWABOCSL', 'SEPSIS']\n",
119
+ "Path: ../data/BaselineED_bench.csv\n",
120
+ "(17, 19)\n",
121
+ "['BPIC13cp', 'BPIC13inc', 'BPIC13op', 'BPIC14dc_p', 'BPIC14di_p', 'BPIC16c_p', 'BPIC16wm_p', 'BPIC17ol', 'BPIC20a', 'BPIC20b', 'BPIC20c', 'BPIC20d', 'BPIC20e', 'HD', 'RTFMP', 'RWABOCSL', 'SEPSIS']\n",
122
+ "(17, 26)\n",
123
+ "Index(['log', 'ratio_unique_traces_per_trace', 'ratio_most_common_variant',\n",
124
+ " 'ratio_top_10_variants', 'epa_normalized_variant_entropy',\n",
125
+ " 'epa_normalized_sequence_entropy',\n",
126
+ " 'epa_normalized_sequence_entropy_linear_forgetting',\n",
127
+ " 'epa_normalized_sequence_entropy_exponential_forgetting', 'fitness_heu',\n",
128
+ " 'precision_heu', 'fscore_heu', 'size_heu', 'pnsize_heu', 'cfc_heu',\n",
129
+ " 'fitness_ilp', 'precision_ilp', 'fscore_ilp', 'size_ilp', 'pnsize_ilp',\n",
130
+ " 'cfc_ilp', 'fitness_imf', 'precision_imf', 'fscore_imf', 'size_imf',\n",
131
+ " 'pnsize_imf', 'cfc_imf'],\n",
132
+ " dtype='object')\n",
133
+ "Imputed dataset: (17, 26)\n",
134
+ "No nan's dataset: (14, 26)\n",
135
+ "FT_COL: ['log', 'ratio_unique_traces_per_trace', 'ratio_most_common_variant', 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting']\n",
136
+ "M_COL: ['log', 'ratio_unique_traces_per_trace', 'ratio_most_common_variant', 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting']\n",
137
+ "(14, 8) (14, 8)\n",
138
+ "BaselineED (14, 8) (14, 8)\n",
139
+ "['rutpt', 'rmcv', 'rt10v', 'enve', 'ense', 'enself', 'enseef']\n",
140
+ "Direct kendalltau BaselineED\n",
141
+ "BaselineED\n",
142
+ "../output/plots/pdm_kendalltau_BaselineED_nanDropped\n"
143
+ ]
144
+ },
145
+ {
146
+ "data": {
147
+ "image/png": "\n",
148
+ "text/plain": [
149
+ "<Figure size 432x288 with 2 Axes>"
150
+ ]
151
+ },
152
+ "metadata": {
153
+ "needs_background": "light"
154
+ },
155
+ "output_type": "display_data"
156
+ }
157
+ ],
158
+ "source": [
159
+ "from scipy.stats import spearmanr\n",
160
+ "from scipy.stats import kendalltau\n",
161
+ "from scipy.stats import pearsonr\n",
162
+ "from numpy import isnan\n",
163
+ "\n",
164
+ "import sys\n",
165
+ "import os\n",
166
+ "sys.path.append(os.path.dirname(\"../gedi/utils/io_helpers.py\"))\n",
167
+ "from io_helpers import get_keys_abbreviation\n",
168
+ "\n",
169
+ "def statistical_test(feature_source, bench_source, test, impute=False):\n",
170
+ " ft = load_data(feature_source, 'feat')\n",
171
+ " #paper_feat_columns = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
172
+ " #ft= ft[paper_feat_columns]\n",
173
+ " print(ft.shape)\n",
174
+ " print(ft['log'].tolist())\n",
175
+ "\n",
176
+ "\n",
177
+ " ben = load_data(bench_source, 'bench')\n",
178
+ " #ben['log']=ben.apply(lambda x: x['log'].replace(\"Gen\",\"\"), axis=1)\n",
179
+ " '''\n",
180
+ " paper_metrics_columns = ['log', 'fitness_heu', 'precision_heu',\n",
181
+ " 'fscore_heu', 'size_heu', 'cfc_heu', 'fitness_ilp', 'precision_ilp', 'fscore_ilp',\n",
182
+ " 'size_ilp','cfc_ilp', 'fitness_imf', 'precision_imf', 'fscore_imf', 'size_imf', 'cfc_imf']\n",
183
+ " '''\n",
184
+ " #ben = ben[paper_metrics_columns]\n",
185
+ " print(ben.shape)\n",
186
+ " print(ben['log'].tolist())\n",
187
+ " fd_pdm = pd.merge(ft, ben, on=['log'], how='inner').reset_index(drop=True)#.reindex(both_df.index)\n",
188
+ "\n",
189
+ " ## DROP DUPLICATES\n",
190
+ " fd_pdm = fd_pdm.reset_index(drop=True)\n",
191
+ " fd_pdm = fd_pdm.T.drop_duplicates().T\n",
192
+ " print(fd_pdm.shape)\n",
193
+ " fd_pdm['log'].unique()\n",
194
+ " \n",
195
+ " print(fd_pdm.columns)\n",
196
+ " benchmark_ft, benchmark_pd = clean_data(fd_pdm, impute, paper_feat_columns, paper_feat_columns)\n",
197
+ " \n",
198
+ " print(benchmark_ft.shape, benchmark_pd.shape)\n",
199
+ "\n",
200
+ " benchmarked_ft_plot = benchmark_ft.copy()\n",
201
+ " benchmarked_pdm_plot = benchmark_pd.copy()\n",
202
+ "\n",
203
+ " #benchmarked_ft = benchmarked_ft.head(10)\n",
204
+ " #benchmarked_pdm = benchmarked_pdm.head(10)\n",
205
+ " print(DATA_SOURCE, benchmarked_ft_plot.shape, benchmarked_pdm_plot.shape)\n",
206
+ "\n",
207
+ " tmp = list(benchmarked_ft_plot.columns[1:-1])\n",
208
+ " df_tmp = pd.DataFrame(index=benchmarked_pdm_plot.columns[1:-1], columns=tmp)\n",
209
+ " #print(\"Benchmark_pdm:\", benchmarked_pdm.columns[1:-1])\n",
210
+ " #print (\"Benchmark_ft:\", tmp)\n",
211
+ "\n",
212
+ " for feature in benchmarked_ft_plot.columns:\n",
213
+ " if feature != 'log' and feature != 'source':\n",
214
+ " for metric in benchmarked_pdm_plot.columns:\n",
215
+ " if metric != 'log' and metric != 'source':\n",
216
+ " #print(feature, benchmarked_pdm.columns[1])\n",
217
+ " stat, p = eval(f\"{TEST}(benchmarked_ft_plot[feature], benchmarked_pdm_plot[metric])\") \n",
218
+ " #print(feature, metric, p, p <= 0.05)\n",
219
+ " df_tmp.loc[metric, feature] = stat*(1.0 if (p <= 0.05) else 0.0)\n",
220
+ "\n",
221
+ " feature_keys = get_keys_abbreviation(df_tmp.columns).split(\"_\")\n",
222
+ " print(feature_keys)\n",
223
+ " df_tmp.columns=feature_keys\n",
224
+ " print(\"Direct\", TEST, DATA_SOURCE)\n",
225
+ " # df_tmp[pd.isnan()]\n",
226
+ "\n",
227
+ " sns.heatmap(df_tmp.fillna(0), annot=True, cmap=\"viridis\", annot_kws={\"size\": 9})\n",
228
+ " ax = plt.gca()\n",
229
+ " sns.heatmap(df_tmp.fillna(0), mask=df_tmp.fillna(0)!=0, cmap=\"Greys\", annot=False, cbar=False, ax=ax)\n",
230
+ " #ax.set_title(\"P-values of features leading to process discovery metrics\", fontsize=15)\n",
231
+ " plt.tight_layout()\n",
232
+ " output_path = f\"../output/plots/pdm_{get_output_file_name(TEST, DATA_SOURCE, IMPUTE)}\"\n",
233
+ " print(output_path)\n",
234
+ " plt.savefig(output_path, dpi=300)\n",
235
+ "\n",
236
+ "statistical_test(DATA_SOURCE+\"_feat\", DATA_SOURCE+\"_bench\", TEST, IMPUTE)"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "id": "5fc91e8f",
243
+ "metadata": {},
244
+ "outputs": [],
245
+ "source": [
246
+ "\n",
247
+ "\n"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "id": "07370d54",
253
+ "metadata": {},
254
+ "source": [
255
+ "## Statistical test: Is there a statistically significant relation between feature similarity and performance metrics?"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "id": "37470503",
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "### DIRECT STATISTICAL TEST\n",
266
+ "from scipy.stats import spearmanr\n",
267
+ "from scipy.stats import kendalltau\n",
268
+ "from scipy.stats import pearsonr\n",
269
+ "from numpy import isnan\n",
270
+ "\n",
271
+ "import sys\n",
272
+ "import os\n",
273
+ "sys.path.append(os.path.dirname(\"../gedi/utils/io_helpers.py\"))\n",
274
+ "from io_helpers import get_keys_abbreviation\n",
275
+ "\n"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": null,
281
+ "id": "f6ae0fd0",
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": []
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": null,
289
+ "id": "3d381199",
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": []
293
+ }
294
+ ],
295
+ "metadata": {
296
+ "kernelspec": {
297
+ "display_name": "Python 3 (ipykernel)",
298
+ "language": "python",
299
+ "name": "python3"
300
+ },
301
+ "language_info": {
302
+ "codemirror_mode": {
303
+ "name": "ipython",
304
+ "version": 3
305
+ },
306
+ "file_extension": ".py",
307
+ "mimetype": "text/x-python",
308
+ "name": "python",
309
+ "nbconvert_exporter": "python",
310
+ "pygments_lexer": "ipython3",
311
+ "version": "3.9.19"
312
+ }
313
+ },
314
+ "nbformat": 4,
315
+ "nbformat_minor": 5
316
+ }
notebooks/benchmarking_process_discovery.ipynb CHANGED
The diff for this file is too large to render. See raw diff