Spaces:

andreamalhera
/

igedi

Sleeping

App Files Files Community

Andrea Maldonado committed on Jun 21, 2024

Commit

84c5238

1 Parent(s): bb0e977

Compare baseline and GenRT features

Browse files

Files changed (1) hide show

notebooks/feature_performance_similarity.ipynb +135 -6

notebooks/feature_performance_similarity.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "b7408494",
    "metadata": {},
    "outputs": [],
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "4ff27cb8",
    "metadata": {},
    "outputs": [
@@ -22,15 +22,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(467, 8) (34, 8)\n",
-      "(501, 9)\n"
      ]
     }
    ],
    "source": [
-    "bpi_ft = pd.read_csv(\"../data/34_bpic_features.csv\").sort_values('log')\n",
-    "gen =pd.read_csv(\"../output/generated/instance_selection_feat.csv\")\n",
     "paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
     "bpi_ft= bpi_ft[paper_cols]\n",
     "print(gen.shape, bpi_ft.shape)\n",
     "#print(gen.columns == df.columns)\n",
@@ -41,6 +42,134 @@
     "feature_logs = both_df['log'].unique()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 39,
    "id": "b7408494",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 43,
    "id": "4ff27cb8",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "(9, 91) (34, 91)\n",
+      "(43, 92)\n"
      ]
     }
    ],
    "source": [
+    "bpi_ft = pd.read_csv(\"../../shampu/data/bench_baseline_feat.csv\").sort_values('log')\n",
+    "gen =pd.read_csv(\"../output/features/bench_baseline_feat_nOR_nDup.csv\")\n",
     "paper_cols = [\"log\",\"ratio_unique_traces_per_trace\", \"ratio_most_common_variant\", 'ratio_top_10_variants', 'epa_normalized_variant_entropy', 'epa_normalized_sequence_entropy', 'epa_normalized_sequence_entropy_linear_forgetting', 'epa_normalized_sequence_entropy_exponential_forgetting'] \n",
+    "paper_cols = gen.columns\n",
     "bpi_ft= bpi_ft[paper_cols]\n",
     "print(gen.shape, bpi_ft.shape)\n",
     "#print(gen.columns == df.columns)\n",
     "feature_logs = both_df['log'].unique()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "e3fa569e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>log</th>\n",
+       "      <th>cosine_similarity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>BPIC13inc</td>\n",
+       "      <td>0.999198</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>BPIC15f2</td>\n",
+       "      <td>0.093858</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>BPIC16q_p</td>\n",
+       "      <td>0.181836</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>BPIC19</td>\n",
+       "      <td>0.865802</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>BPIC20c</td>\n",
+       "      <td>0.673826</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>CSLGWABO2</td>\n",
+       "      <td>0.081622</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>CSLWABO3</td>\n",
+       "      <td>0.154796</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Hospital_log</td>\n",
+       "      <td>0.625071</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>SEPSIS</td>\n",
+       "      <td>0.993330</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            log  cosine_similarity\n",
+       "0     BPIC13inc           0.999198\n",
+       "1      BPIC15f2           0.093858\n",
+       "2     BPIC16q_p           0.181836\n",
+       "3        BPIC19           0.865802\n",
+       "4       BPIC20c           0.673826\n",
+       "5     CSLGWABO2           0.081622\n",
+       "6      CSLWABO3           0.154796\n",
+       "7  Hospital_log           0.625071\n",
+       "8        SEPSIS           0.993330"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def cosine_similarity_df(df1, df2):\n",
+    "    # Ensure both dataframes have the same shape\n",
+    "    if df1.shape != df2.shape:\n",
+    "        raise ValueError(\"The dataframes must have the same shape\")\n",
+    "\n",
+    "    # Compute cosine similarity for each corresponding row\n",
+    "    cosine_similarities = []\n",
+    "    for i in range(len(df1)):\n",
+    "        row1 = df1.iloc[i].values.reshape(1, -1)\n",
+    "        row2 = df2.iloc[i].values.reshape(1, -1)\n",
+    "        similarity = cosine_similarity(row1, row2)[0][0]\n",
+    "        cosine_similarities.append(similarity)\n",
+    "\n",
+    "    # Create a result DataFrame\n",
+    "    result_df = pd.DataFrame({\n",
+    "        'row_index': df1.index,\n",
+    "        'cosine_similarity': cosine_similarities\n",
+    "    })\n",
+    "    return result_df\n",
+    "bpi_ft = bpi_ft.loc[bpi_ft.log.isin(gen.log)]\n",
+    "bpi_ft = bpi_ft.sort_values('log').reset_index(drop=True)\n",
+    "gen = gen.sort_values('log').reset_index(drop=True)\n",
+    "\n",
+    "cos_sim = cosine_similarity_df(bpi_ft.drop(bpi_ft.select_dtypes(include=['object']), axis=1),gen.drop(gen.select_dtypes(include=['object']), axis=1))\n",
+    "cos_sim['log'] = gen['log']\n",
+    "cos_sim[['log', 'cosine_similarity']]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,