Spaces:

erjonb
/

secom

Sleeping

App Files Files Community

erjonb committed on May 23, 2023

Commit

8fecf57

1 Parent(s): dea2a5b

Upload P2 - Secom Notebook - Mercury.ipynb

Browse files

Files changed (1) hide show

P2 - Secom Notebook - Mercury.ipynb +147 -43

P2 - Secom Notebook - Mercury.ipynb CHANGED Viewed

@@ -26,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -64,7 +64,7 @@
      "data": {
       "application/mercury+json": {
        "allow_download": true,
-       "code_uid": "App.0.40.24.1-randef62ebb1",
        "continuous_update": false,
        "description": "Recumpute everything dynamically",
        "full_screen": true,
@@ -96,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -104,6 +104,7 @@
    },
    "outputs": [],
    "source": [
     "# Read the features data from the url of csv into pandas dataframes and rename the columns to F1, F2, F3, etc.\n",
     "# Read the labels data from the url of csv into pandas dataframes and rename the columns to pass/fail and date/time\n",
     "\n",
@@ -137,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -194,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -289,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -340,7 +341,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -418,7 +419,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -484,6 +485,74 @@
     "        return df_imputed\n"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -498,7 +567,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -569,7 +638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -658,7 +727,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -747,7 +816,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -761,17 +830,17 @@
         "yes",
         "no"
        ],
-       "code_uid": "Select.0.40.16.25-rand77ec76da",
        "disabled": false,
        "hidden": false,
        "label": "Drop Duplicates",
-       "model_id": "3287bcba5d3e42019072b9ba3c8cee67",
        "url_key": "",
        "value": "yes",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3287bcba5d3e42019072b9ba3c8cee67",
        "version_major": 2,
        "version_minor": 0
       },
@@ -785,18 +854,18 @@
     {
      "data": {
       "application/mercury+json": {
-       "code_uid": "Text.0.40.15.28-rand0b79dd2f",
        "disabled": false,
        "hidden": false,
        "label": "Missing Value Threeshold",
-       "model_id": "2927629f747d41209e38703c8dddc4cb",
        "rows": 1,
        "url_key": "",
        "value": "80",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2927629f747d41209e38703c8dddc4cb",
        "version_major": 2,
        "version_minor": 0
       },
@@ -810,18 +879,18 @@
     {
      "data": {
       "application/mercury+json": {
-       "code_uid": "Text.0.40.15.31-rande8db764a",
        "disabled": false,
        "hidden": false,
        "label": "Variance Threshold",
-       "model_id": "0832e29dacb44b0a8da9eccc6702999f",
        "rows": 1,
        "url_key": "",
        "value": "0",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0832e29dacb44b0a8da9eccc6702999f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -835,18 +904,18 @@
     {
      "data": {
       "application/mercury+json": {
-       "code_uid": "Text.0.40.15.34-randb2ccd11d",
        "disabled": false,
        "hidden": false,
        "label": "Correlation Threshold",
-       "model_id": "b4688cca688c46b8a77ce5c0fc8a808f",
        "rows": 1,
        "url_key": "",
        "value": "1",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b4688cca688c46b8a77ce5c0fc8a808f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -866,17 +935,17 @@
         4,
         5
        ],
-       "code_uid": "Select.0.40.16.38-rand5391f652",
        "disabled": false,
        "hidden": false,
        "label": "Outlier Removal Threshold",
-       "model_id": "8aa78a5763854991bff8c4e0ce199acc",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8aa78a5763854991bff8c4e0ce199acc",
        "version_major": 2,
        "version_minor": 0
       },
@@ -897,17 +966,17 @@
         "minmax",
         "robust"
        ],
-       "code_uid": "Select.0.40.16.46-rand581c3f74",
        "disabled": false,
        "hidden": false,
        "label": "Scaling Variables",
-       "model_id": "9c7f7fe0460f45f5bdbf5b93e0e7e185",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9c7f7fe0460f45f5bdbf5b93e0e7e185",
        "version_major": 2,
        "version_minor": 0
       },
@@ -927,17 +996,48 @@
         "knn",
         "most_frequent"
        ],
-       "code_uid": "Select.0.40.16.50-randd879e6bf",
        "disabled": false,
        "hidden": false,
        "label": "Imputation Methods",
-       "model_id": "c67be681353d4115a0f4f2df41ba8725",
        "url_key": "",
        "value": "mean",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c67be681353d4115a0f4f2df41ba8725",
        "version_major": 2,
        "version_minor": 0
       },
@@ -957,17 +1057,17 @@
         "undersampling",
         "rose"
        ],
-       "code_uid": "Select.0.40.16.55-randbc53979d",
        "disabled": false,
        "hidden": false,
        "label": "Imbalance Treatment",
-       "model_id": "fa34ae8676274bb192425ae3901ca186",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fa34ae8676274bb192425ae3901ca186",
        "version_major": 2,
        "version_minor": 0
       },
@@ -990,17 +1090,17 @@
         "decision_tree",
         "xgboost"
        ],
-       "code_uid": "Select.0.40.16.60-rand0bc5431d",
        "disabled": false,
        "hidden": false,
        "label": "Model Selection",
-       "model_id": "2b5c968f1dc74736910ad206a4c55af0",
        "url_key": "",
        "value": "random_forest",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2b5c968f1dc74736910ad206a4c55af0",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1067,6 +1167,10 @@
     "input_n_neighbors = 5                  # only for knn imputation\n",
     "input_imputation_method = str(input_imputation_method.value)\n",
     "\n",
     "# input imbalance treatment variables\n",
     "input_imbalance_treatment = mr.Select(label=\"Imbalance Treatment\", value=\"none\", choices=['none', 'smote', 'undersampling', 'rose'])       # 'none', 'smote', 'undersampling', 'rose'\n",
     "input_imbalance_treatment = str(input_imbalance_treatment.value)\n",
@@ -1104,7 +1208,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -1182,7 +1286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -1220,7 +1324,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"

   },
   {
    "cell_type": "code",
+   "execution_count": 42,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 43,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
      "data": {
       "application/mercury+json": {
        "allow_download": true,
+       "code_uid": "App.0.40.24.1-rand0e93859a",
        "continuous_update": false,
        "description": "Recumpute everything dynamically",
        "full_screen": true,
   },
   {
    "cell_type": "code",
+   "execution_count": 44,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
    },
    "outputs": [],
    "source": [
+    "   \n",
     "# Read the features data from the url of csv into pandas dataframes and rename the columns to F1, F2, F3, etc.\n",
     "# Read the labels data from the url of csv into pandas dataframes and rename the columns to pass/fail and date/time\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 45,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 46,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 47,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 48,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 49,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
     "        return df_imputed\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def feature_selection(method, X_train, y_train):\n",
+    "    \"\"\"Reduce the training features with the requested method.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    method : str\n",
+    "        One of 'none', 'lasso', 'rfe', 'pca' or 'boruta'.\n",
+    "    X_train : pandas.DataFrame\n",
+    "        Training features (the code relies on .columns, .iloc and .values).\n",
+    "    y_train : pandas object\n",
+    "        Training labels (the boruta branch calls .values.ravel() on it).\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    tuple\n",
+    "        (filtered training data, selected feature names). 'none', 'rfe' and\n",
+    "        'boruta' return a DataFrame; 'lasso' and 'pca' return whatever the\n",
+    "        scikit-learn transform yields (typically a NumPy array).\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If method is not one of the supported names.\n",
+    "\n",
+    "    Side effects\n",
+    "    ------------\n",
+    "    Updates the globals feature_selection_var and selected_features.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    global feature_selection_var\n",
+    "    global selected_features    \n",
+    "\n",
+    "    if method == 'boruta':\n",
+    "        print('Selected method is: ', method)\n",
+    "        from boruta import BorutaPy\n",
+    "        from sklearn.ensemble import RandomForestClassifier\n",
+    "        rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
+    "        boruta_selector = BorutaPy(rf,n_estimators='auto', verbose=0, random_state=42)\n",
+    "        boruta_selector.fit(X_train.values, y_train.values.ravel())\n",
+    "        selected_feature_indices = boruta_selector.support_\n",
+    "        selected_columns = X_train.columns[selected_feature_indices]\n",
+    "        X_train_filtered = X_train.iloc[:, selected_feature_indices]\n",
+    "        print('Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape)\n",
+    "        # Record the choice in the shared globals, as every other branch does;\n",
+    "        # otherwise they keep stale values from a previous run.\n",
+    "        feature_selection_var = 'boruta'\n",
+    "        selected_features = selected_columns\n",
+    "        return X_train_filtered, selected_features\n",
+    "    \n",
+    "    if method == 'none':\n",
+    "        print('Selected method is: ', method)\n",
+    "        X_train_filtered = X_train\n",
+    "        print('Shape of the training set after no feature selection: ', X_train_filtered.shape)\n",
+    "        feature_selection_var = 'none'\n",
+    "        selected_features = X_train_filtered.columns\n",
+    "        return X_train_filtered, selected_features        \n",
+    "    \n",
+    "    if method == 'lasso':\n",
+    "        print('Selected method is: ', method)\n",
+    "        from sklearn.linear_model import LassoCV\n",
+    "        from sklearn.feature_selection import SelectFromModel\n",
+    "        lasso = LassoCV().fit(X_train, y_train)\n",
+    "        model = SelectFromModel(lasso, prefit=True)\n",
+    "        X_train_filtered = model.transform(X_train)\n",
+    "        selected_features = X_train.columns[model.get_support()]\n",
+    "        print('Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape)\n",
+    "        feature_selection_var = 'lasso'\n",
+    "        return X_train_filtered, selected_features\n",
+    "    \n",
+    "    if method == 'pca':\n",
+    "        print('Selected method is: ', method)\n",
+    "        from sklearn.decomposition import PCA\n",
+    "        pca = PCA(n_components=15)\n",
+    "        X_train_pca = pca.fit_transform(X_train)\n",
+    "        # NOTE(review): PCA components are linear combinations of all columns, and\n",
+    "        # explained_variance_ratio_ has one entry per component (already sorted\n",
+    "        # descending), so this appears to resolve to the first 15 column names of\n",
+    "        # X_train rather than truly 'selected' features. Kept unchanged because\n",
+    "        # downstream code may rely on it; confirm before using these names.\n",
+    "        selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n",
+    "        print('Shape of the training set after feature selection with PCA: ', X_train_pca.shape)\n",
+    "        feature_selection_var = 'pca'\n",
+    "        return X_train_pca, selected_features\n",
+    "    \n",
+    "    if method == 'rfe':\n",
+    "        print('Selected method is: ', method)\n",
+    "        from sklearn.feature_selection import RFE\n",
+    "        from sklearn.ensemble import RandomForestClassifier\n",
+    "        rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n",
+    "        rfe_selector.fit(X_train, y_train)\n",
+    "        selected_features = X_train.columns[rfe_selector.support_]\n",
+    "        X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n",
+    "        print('Shape of the training set after feature selection with RFE: ', X_train_filtered.shape)\n",
+    "        feature_selection_var = 'rfe'\n",
+    "        return X_train_filtered, selected_features\n",
+    "\n",
+    "    # Fail loudly instead of silently returning None for an unknown method\n",
+    "    raise ValueError('Unknown feature selection method: ' + str(method))"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
   },
   {
    "cell_type": "code",
+   "execution_count": 51,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 52,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 53,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 54,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
         "yes",
         "no"
        ],
+       "code_uid": "Select.0.40.16.25-rand98b210b9",
        "disabled": false,
        "hidden": false,
        "label": "Drop Duplicates",
+       "model_id": "f2d0fb31478a477ea1b0d4c4aa80fb2e",
        "url_key": "",
        "value": "yes",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2d0fb31478a477ea1b0d4c4aa80fb2e",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/mercury+json": {
+       "code_uid": "Text.0.40.15.28-randc77c765e",
        "disabled": false,
        "hidden": false,
        "label": "Missing Value Threeshold",
+       "model_id": "f97f46b880e3434082498e1e720b0b65",
        "rows": 1,
        "url_key": "",
        "value": "80",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f97f46b880e3434082498e1e720b0b65",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/mercury+json": {
+       "code_uid": "Text.0.40.15.31-rand54dd8817",
        "disabled": false,
        "hidden": false,
        "label": "Variance Threshold",
+       "model_id": "4839b45f7e9b483d9fb8ee7fc05f1e19",
        "rows": 1,
        "url_key": "",
        "value": "0",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4839b45f7e9b483d9fb8ee7fc05f1e19",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/mercury+json": {
+       "code_uid": "Text.0.40.15.34-rand811824bd",
        "disabled": false,
        "hidden": false,
        "label": "Correlation Threshold",
+       "model_id": "10033d424ab949f7b51462e444e17ba7",
        "rows": 1,
        "url_key": "",
        "value": "1",
        "widget": "Text"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "10033d424ab949f7b51462e444e17ba7",
        "version_major": 2,
        "version_minor": 0
       },
         4,
         5
        ],
+       "code_uid": "Select.0.40.16.38-rand10d00d99",
        "disabled": false,
        "hidden": false,
        "label": "Outlier Removal Threshold",
+       "model_id": "96b8980bceaf46459d9ec06c8fb7c818",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "96b8980bceaf46459d9ec06c8fb7c818",
        "version_major": 2,
        "version_minor": 0
       },
         "minmax",
         "robust"
        ],
+       "code_uid": "Select.0.40.16.46-rand1bc79c9d",
        "disabled": false,
        "hidden": false,
        "label": "Scaling Variables",
+       "model_id": "e7650cea7a834d588a995407052e1f2c",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e7650cea7a834d588a995407052e1f2c",
        "version_major": 2,
        "version_minor": 0
       },
         "knn",
         "most_frequent"
        ],
+       "code_uid": "Select.0.40.16.50-rand69ae31a0",
        "disabled": false,
        "hidden": false,
        "label": "Imputation Methods",
+       "model_id": "652a64af16174970919183e6ab1c5b53",
        "url_key": "",
        "value": "mean",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "652a64af16174970919183e6ab1c5b53",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "mercury.Select"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/mercury+json": {
+       "choices": [
+        "none",
+        "lasso",
+        "rfe",
+        "pca",
+        "boruta"
+       ],
+       "code_uid": "Select.0.40.16.55-rand148632f9",
+       "disabled": false,
+       "hidden": false,
+       "label": "Feature Selection",
+       "model_id": "9a1a199471314cd7a3363ea25d9d341a",
+       "url_key": "",
+       "value": "none",
+       "widget": "Select"
+      },
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9a1a199471314cd7a3363ea25d9d341a",
        "version_major": 2,
        "version_minor": 0
       },
         "undersampling",
         "rose"
        ],
+       "code_uid": "Select.0.40.16.59-rand3a34b3e3",
        "disabled": false,
        "hidden": false,
        "label": "Imbalance Treatment",
+       "model_id": "358dd80171af4de2a944c3077b2f48d8",
        "url_key": "",
        "value": "none",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "358dd80171af4de2a944c3077b2f48d8",
        "version_major": 2,
        "version_minor": 0
       },
         "decision_tree",
         "xgboost"
        ],
+       "code_uid": "Select.0.40.16.64-rand4b9cf5e0",
        "disabled": false,
        "hidden": false,
        "label": "Model Selection",
+       "model_id": "8477ca5211bd4914861b3e48cda21c10",
        "url_key": "",
        "value": "random_forest",
        "widget": "Select"
       },
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8477ca5211bd4914861b3e48cda21c10",
        "version_major": 2,
        "version_minor": 0
       },
     "input_n_neighbors = 5                  # only for knn imputation\n",
     "input_imputation_method = str(input_imputation_method.value)\n",
     "\n",
+    "# input feature selection variables\n",
+    "input_feature_selection = mr.Select(label=\"Feature Selection\", value=\"none\", choices=['none', 'lasso', 'rfe', 'pca', 'boruta'])          # 'none', 'lasso', 'rfe', 'pca', 'boruta'\n",
+    "input_feature_selection = str(input_feature_selection.value)\n",
+    "\n",
     "# input imbalance treatment variables\n",
     "input_imbalance_treatment = mr.Select(label=\"Imbalance Treatment\", value=\"none\", choices=['none', 'smote', 'undersampling', 'rose'])       # 'none', 'smote', 'undersampling', 'rose'\n",
     "input_imbalance_treatment = str(input_imbalance_treatment.value)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 55,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 56,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
   },
   {
    "cell_type": "code",
+   "execution_count": 57,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"