Spaces:
Sleeping
Sleeping
Andrea Maldonado
committed on
Commit
·
84c5238
1
Parent(s):
bb0e977
Compare baseline and GenRT features
Browse files
notebooks/feature_performance_similarity.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"id": "b7408494",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
@@ -14,7 +14,7 @@
|
|
14 |
},
|
15 |
{
|
16 |
"cell_type": "code",
|
17 |
-
"execution_count":
|
18 |
"id": "4ff27cb8",
|
19 |
"metadata": {},
|
20 |
"outputs": [
|
@@ -22,15 +22,16 @@
|
|
22 |
"name": "stdout",
|
23 |
"output_type": "stream",
|
24 |
"text": [
|
25 |
-
"(
|
26 |
-
"(
|
27 |
]
|
28 |
}
|
29 |
],
|
30 |
"source": [
|
31 |
-
"bpi_ft = pd.read_csv(\"
|
32 |
-
"gen =pd.read_csv(\"../output/
|
33 |
"paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
|
|
|
34 |
"bpi_ft= bpi_ft[paper_cols]\n",
|
35 |
"print(gen.shape, bpi_ft.shape)\n",
|
36 |
"#print(gen.columns == df.columns)\n",
|
@@ -41,6 +42,134 @@
|
|
41 |
"feature_logs = both_df['log'].unique()"
|
42 |
]
|
43 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
{
|
45 |
"cell_type": "code",
|
46 |
"execution_count": 3,
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 39,
|
6 |
"id": "b7408494",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
|
|
14 |
},
|
15 |
{
|
16 |
"cell_type": "code",
|
17 |
+
"execution_count": 43,
|
18 |
"id": "4ff27cb8",
|
19 |
"metadata": {},
|
20 |
"outputs": [
|
|
|
22 |
"name": "stdout",
|
23 |
"output_type": "stream",
|
24 |
"text": [
|
25 |
+
"(9, 91) (34, 91)\n",
|
26 |
+
"(43, 92)\n"
|
27 |
]
|
28 |
}
|
29 |
],
|
30 |
"source": [
|
31 |
+
"bpi_ft = pd.read_csv(\"../../shampu/data/bench_baseline_feat.csv\").sort_values('log')\n",
|
32 |
+
"gen =pd.read_csv(\"../output/features/bench_baseline_feat_nOR_nDup.csv\")\n",
|
33 |
"paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
|
34 |
+
"paper_cols = gen.columns\n",
|
35 |
"bpi_ft= bpi_ft[paper_cols]\n",
|
36 |
"print(gen.shape, bpi_ft.shape)\n",
|
37 |
"#print(gen.columns == df.columns)\n",
|
|
|
42 |
"feature_logs = both_df['log'].unique()"
|
43 |
]
|
44 |
},
|
45 |
+
{
|
46 |
+
"cell_type": "code",
|
47 |
+
"execution_count": 44,
|
48 |
+
"id": "e3fa569e",
|
49 |
+
"metadata": {},
|
50 |
+
"outputs": [
|
51 |
+
{
|
52 |
+
"data": {
|
53 |
+
"text/html": [
|
54 |
+
"<div>\n",
|
55 |
+
"<style scoped>\n",
|
56 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
57 |
+
" vertical-align: middle;\n",
|
58 |
+
" }\n",
|
59 |
+
"\n",
|
60 |
+
" .dataframe tbody tr th {\n",
|
61 |
+
" vertical-align: top;\n",
|
62 |
+
" }\n",
|
63 |
+
"\n",
|
64 |
+
" .dataframe thead th {\n",
|
65 |
+
" text-align: right;\n",
|
66 |
+
" }\n",
|
67 |
+
"</style>\n",
|
68 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
69 |
+
" <thead>\n",
|
70 |
+
" <tr style=\"text-align: right;\">\n",
|
71 |
+
" <th></th>\n",
|
72 |
+
" <th>log</th>\n",
|
73 |
+
" <th>cosine_similarity</th>\n",
|
74 |
+
" </tr>\n",
|
75 |
+
" </thead>\n",
|
76 |
+
" <tbody>\n",
|
77 |
+
" <tr>\n",
|
78 |
+
" <th>0</th>\n",
|
79 |
+
" <td>BPIC13inc</td>\n",
|
80 |
+
" <td>0.999198</td>\n",
|
81 |
+
" </tr>\n",
|
82 |
+
" <tr>\n",
|
83 |
+
" <th>1</th>\n",
|
84 |
+
" <td>BPIC15f2</td>\n",
|
85 |
+
" <td>0.093858</td>\n",
|
86 |
+
" </tr>\n",
|
87 |
+
" <tr>\n",
|
88 |
+
" <th>2</th>\n",
|
89 |
+
" <td>BPIC16q_p</td>\n",
|
90 |
+
" <td>0.181836</td>\n",
|
91 |
+
" </tr>\n",
|
92 |
+
" <tr>\n",
|
93 |
+
" <th>3</th>\n",
|
94 |
+
" <td>BPIC19</td>\n",
|
95 |
+
" <td>0.865802</td>\n",
|
96 |
+
" </tr>\n",
|
97 |
+
" <tr>\n",
|
98 |
+
" <th>4</th>\n",
|
99 |
+
" <td>BPIC20c</td>\n",
|
100 |
+
" <td>0.673826</td>\n",
|
101 |
+
" </tr>\n",
|
102 |
+
" <tr>\n",
|
103 |
+
" <th>5</th>\n",
|
104 |
+
" <td>CSLGWABO2</td>\n",
|
105 |
+
" <td>0.081622</td>\n",
|
106 |
+
" </tr>\n",
|
107 |
+
" <tr>\n",
|
108 |
+
" <th>6</th>\n",
|
109 |
+
" <td>CSLWABO3</td>\n",
|
110 |
+
" <td>0.154796</td>\n",
|
111 |
+
" </tr>\n",
|
112 |
+
" <tr>\n",
|
113 |
+
" <th>7</th>\n",
|
114 |
+
" <td>Hospital_log</td>\n",
|
115 |
+
" <td>0.625071</td>\n",
|
116 |
+
" </tr>\n",
|
117 |
+
" <tr>\n",
|
118 |
+
" <th>8</th>\n",
|
119 |
+
" <td>SEPSIS</td>\n",
|
120 |
+
" <td>0.993330</td>\n",
|
121 |
+
" </tr>\n",
|
122 |
+
" </tbody>\n",
|
123 |
+
"</table>\n",
|
124 |
+
"</div>"
|
125 |
+
],
|
126 |
+
"text/plain": [
|
127 |
+
" log cosine_similarity\n",
|
128 |
+
"0 BPIC13inc 0.999198\n",
|
129 |
+
"1 BPIC15f2 0.093858\n",
|
130 |
+
"2 BPIC16q_p 0.181836\n",
|
131 |
+
"3 BPIC19 0.865802\n",
|
132 |
+
"4 BPIC20c 0.673826\n",
|
133 |
+
"5 CSLGWABO2 0.081622\n",
|
134 |
+
"6 CSLWABO3 0.154796\n",
|
135 |
+
"7 Hospital_log 0.625071\n",
|
136 |
+
"8 SEPSIS 0.993330"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
"execution_count": 44,
|
140 |
+
"metadata": {},
|
141 |
+
"output_type": "execute_result"
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"source": [
|
145 |
+
"def cosine_similarity_df(df1, df2):\n",
|
146 |
+
" # Ensure both dataframes have the same shape\n",
|
147 |
+
" if df1.shape != df2.shape:\n",
|
148 |
+
" raise ValueError(\"The dataframes must have the same shape\")\n",
|
149 |
+
"\n",
|
150 |
+
" # Compute cosine similarity for each corresponding row\n",
|
151 |
+
" cosine_similarities = []\n",
|
152 |
+
" for i in range(len(df1)):\n",
|
153 |
+
" row1 = df1.iloc[i].values.reshape(1, -1)\n",
|
154 |
+
" row2 = df2.iloc[i].values.reshape(1, -1)\n",
|
155 |
+
" similarity = cosine_similarity(row1, row2)[0][0]\n",
|
156 |
+
" cosine_similarities.append(similarity)\n",
|
157 |
+
"\n",
|
158 |
+
" # Create a result DataFrame\n",
|
159 |
+
" result_df = pd.DataFrame({\n",
|
160 |
+
" 'row_index': df1.index,\n",
|
161 |
+
" 'cosine_similarity': cosine_similarities\n",
|
162 |
+
" })\n",
|
163 |
+
" return result_df\n",
|
164 |
+
"bpi_ft = bpi_ft.loc[bpi_ft.log.isin(gen.log)]\n",
|
165 |
+
"bpi_ft = bpi_ft.sort_values('log').reset_index(drop=True)\n",
|
166 |
+
"gen = gen.sort_values('log').reset_index(drop=True)\n",
|
167 |
+
"\n",
|
168 |
+
"cos_sim = cosine_similarity_df(bpi_ft.drop(bpi_ft.select_dtypes(include=['object']), axis=1),gen.drop(gen.select_dtypes(include=['object']), axis=1))\n",
|
169 |
+
"cos_sim['log'] = gen['log']\n",
|
170 |
+
"cos_sim[['log', 'cosine_similarity']]"
|
171 |
+
]
|
172 |
+
},
|
173 |
{
|
174 |
"cell_type": "code",
|
175 |
"execution_count": 3,
|