Andrea Maldonado committed on
Commit
84c5238
·
1 Parent(s): bb0e977

Compare baseline and GenRT features

Browse files
notebooks/feature_performance_similarity.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "b7408494",
7
  "metadata": {},
8
  "outputs": [],
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "cell_type": "code",
17
- "execution_count": 2,
18
  "id": "4ff27cb8",
19
  "metadata": {},
20
  "outputs": [
@@ -22,15 +22,16 @@
22
  "name": "stdout",
23
  "output_type": "stream",
24
  "text": [
25
- "(467, 8) (34, 8)\n",
26
- "(501, 9)\n"
27
  ]
28
  }
29
  ],
30
  "source": [
31
- "bpi_ft = pd.read_csv(\"../data/34_bpic_features.csv\").sort_values('log')\n",
32
- "gen =pd.read_csv(\"../output/generated/instance_selection_feat.csv\")\n",
33
  "paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
 
34
  "bpi_ft= bpi_ft[paper_cols]\n",
35
  "print(gen.shape, bpi_ft.shape)\n",
36
  "#print(gen.columns == df.columns)\n",
@@ -41,6 +42,134 @@
41
  "feature_logs = both_df['log'].unique()"
42
  ]
43
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  {
45
  "cell_type": "code",
46
  "execution_count": 3,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 39,
6
  "id": "b7408494",
7
  "metadata": {},
8
  "outputs": [],
 
14
  },
15
  {
16
  "cell_type": "code",
17
+ "execution_count": 43,
18
  "id": "4ff27cb8",
19
  "metadata": {},
20
  "outputs": [
 
22
  "name": "stdout",
23
  "output_type": "stream",
24
  "text": [
25
+ "(9, 91) (34, 91)\n",
26
+ "(43, 92)\n"
27
  ]
28
  }
29
  ],
30
  "source": [
31
+ "bpi_ft = pd.read_csv(\"../../shampu/data/bench_baseline_feat.csv\").sort_values('log')\n",
32
+ "gen =pd.read_csv(\"../output/features/bench_baseline_feat_nOR_nDup.csv\")\n",
33
  "paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
34
+ "paper_cols = gen.columns\n",
35
  "bpi_ft= bpi_ft[paper_cols]\n",
36
  "print(gen.shape, bpi_ft.shape)\n",
37
  "#print(gen.columns == df.columns)\n",
 
42
  "feature_logs = both_df['log'].unique()"
43
  ]
44
  },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 44,
48
+ "id": "e3fa569e",
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "data": {
53
+ "text/html": [
54
+ "<div>\n",
55
+ "<style scoped>\n",
56
+ " .dataframe tbody tr th:only-of-type {\n",
57
+ " vertical-align: middle;\n",
58
+ " }\n",
59
+ "\n",
60
+ " .dataframe tbody tr th {\n",
61
+ " vertical-align: top;\n",
62
+ " }\n",
63
+ "\n",
64
+ " .dataframe thead th {\n",
65
+ " text-align: right;\n",
66
+ " }\n",
67
+ "</style>\n",
68
+ "<table border=\"1\" class=\"dataframe\">\n",
69
+ " <thead>\n",
70
+ " <tr style=\"text-align: right;\">\n",
71
+ " <th></th>\n",
72
+ " <th>log</th>\n",
73
+ " <th>cosine_similarity</th>\n",
74
+ " </tr>\n",
75
+ " </thead>\n",
76
+ " <tbody>\n",
77
+ " <tr>\n",
78
+ " <th>0</th>\n",
79
+ " <td>BPIC13inc</td>\n",
80
+ " <td>0.999198</td>\n",
81
+ " </tr>\n",
82
+ " <tr>\n",
83
+ " <th>1</th>\n",
84
+ " <td>BPIC15f2</td>\n",
85
+ " <td>0.093858</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>2</th>\n",
89
+ " <td>BPIC16q_p</td>\n",
90
+ " <td>0.181836</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>3</th>\n",
94
+ " <td>BPIC19</td>\n",
95
+ " <td>0.865802</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>4</th>\n",
99
+ " <td>BPIC20c</td>\n",
100
+ " <td>0.673826</td>\n",
101
+ " </tr>\n",
102
+ " <tr>\n",
103
+ " <th>5</th>\n",
104
+ " <td>CSLGWABO2</td>\n",
105
+ " <td>0.081622</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>6</th>\n",
109
+ " <td>CSLWABO3</td>\n",
110
+ " <td>0.154796</td>\n",
111
+ " </tr>\n",
112
+ " <tr>\n",
113
+ " <th>7</th>\n",
114
+ " <td>Hospital_log</td>\n",
115
+ " <td>0.625071</td>\n",
116
+ " </tr>\n",
117
+ " <tr>\n",
118
+ " <th>8</th>\n",
119
+ " <td>SEPSIS</td>\n",
120
+ " <td>0.993330</td>\n",
121
+ " </tr>\n",
122
+ " </tbody>\n",
123
+ "</table>\n",
124
+ "</div>"
125
+ ],
126
+ "text/plain": [
127
+ " log cosine_similarity\n",
128
+ "0 BPIC13inc 0.999198\n",
129
+ "1 BPIC15f2 0.093858\n",
130
+ "2 BPIC16q_p 0.181836\n",
131
+ "3 BPIC19 0.865802\n",
132
+ "4 BPIC20c 0.673826\n",
133
+ "5 CSLGWABO2 0.081622\n",
134
+ "6 CSLWABO3 0.154796\n",
135
+ "7 Hospital_log 0.625071\n",
136
+ "8 SEPSIS 0.993330"
137
+ ]
138
+ },
139
+ "execution_count": 44,
140
+ "metadata": {},
141
+ "output_type": "execute_result"
142
+ }
143
+ ],
144
+ "source": [
145
+ "def cosine_similarity_df(df1, df2):\n",
146
+ " # Ensure both dataframes have the same shape\n",
147
+ " if df1.shape != df2.shape:\n",
148
+ " raise ValueError(\"The dataframes must have the same shape\")\n",
149
+ "\n",
150
+ " # Compute cosine similarity for each corresponding row\n",
151
+ " cosine_similarities = []\n",
152
+ " for i in range(len(df1)):\n",
153
+ " row1 = df1.iloc[i].values.reshape(1, -1)\n",
154
+ " row2 = df2.iloc[i].values.reshape(1, -1)\n",
155
+ " similarity = cosine_similarity(row1, row2)[0][0]\n",
156
+ " cosine_similarities.append(similarity)\n",
157
+ "\n",
158
+ " # Create a result DataFrame\n",
159
+ " result_df = pd.DataFrame({\n",
160
+ " 'row_index': df1.index,\n",
161
+ " 'cosine_similarity': cosine_similarities\n",
162
+ " })\n",
163
+ " return result_df\n",
164
+ "bpi_ft = bpi_ft.loc[bpi_ft.log.isin(gen.log)]\n",
165
+ "bpi_ft = bpi_ft.sort_values('log').reset_index(drop=True)\n",
166
+ "gen = gen.sort_values('log').reset_index(drop=True)\n",
167
+ "\n",
168
+ "cos_sim = cosine_similarity_df(bpi_ft.drop(bpi_ft.select_dtypes(include=['object']), axis=1),gen.drop(gen.select_dtypes(include=['object']), axis=1))\n",
169
+ "cos_sim['log'] = gen['log']\n",
170
+ "cos_sim[['log', 'cosine_similarity']]"
171
+ ]
172
+ },
173
  {
174
  "cell_type": "code",
175
  "execution_count": 3,