erjonb committed on
Commit
3f6fe9c
·
1 Parent(s): 51f4ca9

Upload P2 - Secom Notebook2 - Mercury.ipynb

Browse files
Files changed (1) hide show
  1. P2 - Secom Notebook2 - Mercury.ipynb +159 -143
P2 - Secom Notebook2 - Mercury.ipynb CHANGED
@@ -26,8 +26,12 @@
26
  },
27
  {
28
  "cell_type": "code",
29
- "execution_count": 357,
30
- "metadata": {},
 
 
 
 
31
  "outputs": [],
32
  "source": [
33
  "# import pandas for data manipulation\n",
@@ -53,14 +57,18 @@
53
  },
54
  {
55
  "cell_type": "code",
56
- "execution_count": 358,
57
- "metadata": {},
 
 
 
 
58
  "outputs": [
59
  {
60
  "data": {
61
  "application/mercury+json": {
62
  "allow_download": true,
63
- "code_uid": "App.0.40.24.1-randd9fe9ae5",
64
  "continuous_update": false,
65
  "description": "Recumpute everything dynamically",
66
  "full_screen": true,
@@ -92,8 +100,12 @@
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": 359,
96
- "metadata": {},
 
 
 
 
97
  "outputs": [],
98
  "source": [
99
  "# Read the features data from the CSV URL into pandas dataframes and rename the columns to F1, F2, F3, etc.\n",
@@ -129,24 +141,28 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 360,
133
- "metadata": {},
 
 
 
 
134
  "outputs": [
135
  {
136
  "data": {
137
  "application/mercury+json": {
138
- "code_uid": "Text.0.40.15.11-randec98731b",
139
  "disabled": false,
140
  "hidden": false,
141
  "label": "Test Size Ratio",
142
- "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
143
  "rows": 1,
144
  "url_key": "",
145
  "value": "0.25",
146
  "widget": "Text"
147
  },
148
  "application/vnd.jupyter.widget-view+json": {
149
- "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
150
  "version_major": 2,
151
  "version_minor": 0
152
  },
@@ -160,18 +176,18 @@
160
  {
161
  "data": {
162
  "application/mercury+json": {
163
- "code_uid": "Text.0.40.15.14-randfa24ca10",
164
  "disabled": false,
165
  "hidden": false,
166
  "label": "Random State Integer",
167
- "model_id": "cdaf85c404494bae95a32286425b9034",
168
  "rows": 1,
169
  "url_key": "",
170
  "value": "13",
171
  "widget": "Text"
172
  },
173
  "application/vnd.jupyter.widget-view+json": {
174
- "model_id": "cdaf85c404494bae95a32286425b9034",
175
  "version_major": 2,
176
  "version_minor": 0
177
  },
@@ -220,8 +236,12 @@
220
  },
221
  {
222
  "cell_type": "code",
223
- "execution_count": 361,
224
- "metadata": {},
 
 
 
 
225
  "outputs": [],
226
  "source": [
227
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
@@ -302,7 +322,6 @@
302
  " global correlation_threshold_var\n",
303
  " correlation_threshold_var = correlation_threshold\n",
304
  " \n",
305
- " print(type(dropped))\n",
306
  " return dropped"
307
  ]
308
  },
@@ -320,8 +339,12 @@
320
  },
321
  {
322
  "cell_type": "code",
323
- "execution_count": 362,
324
- "metadata": {},
 
 
 
 
325
  "outputs": [],
326
  "source": [
327
  "def outlier_removal(z_df, z_threshold=4):\n",
@@ -369,8 +392,12 @@
369
  },
370
  {
371
  "cell_type": "code",
372
- "execution_count": 363,
373
- "metadata": {},
 
 
 
 
374
  "outputs": [],
375
  "source": [
376
  "# define a function to scale the dataframe using different scaling models\n",
@@ -444,8 +471,12 @@
444
  },
445
  {
446
  "cell_type": "code",
447
- "execution_count": 364,
448
- "metadata": {},
 
 
 
 
449
  "outputs": [],
450
  "source": [
451
  "# define a function to impute missing values using different imputation models\n",
@@ -529,8 +560,12 @@
529
  },
530
  {
531
  "cell_type": "code",
532
- "execution_count": 365,
533
- "metadata": {},
 
 
 
 
534
  "outputs": [],
535
  "source": [
536
  "def feature_selection(method, X_train, y_train):\n",
@@ -615,8 +650,12 @@
615
  },
616
  {
617
  "cell_type": "code",
618
- "execution_count": 366,
619
- "metadata": {},
 
 
 
 
620
  "outputs": [],
621
  "source": [
622
  "# define a function to oversample or undersample the training set to treat the class imbalance\n",
@@ -632,7 +671,7 @@
632
  " sm = SMOTE(random_state=42)\n",
633
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
634
  " imbalance_report0 = 'Shape of the training set after oversampling with SMOTE: ', X_train_res.shape\n",
635
- " imbalance_report1 = 'Value counts of the target variable after oversampling with SMOTE: \\n', y_train_res.value_counts()\n",
636
  " imbalance_var = 'smote'\n",
637
  " return X_train_res, y_train_res\n",
638
  " \n",
@@ -641,7 +680,7 @@
641
  " rus = RandomUnderSampler(random_state=42)\n",
642
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
643
  " imbalance_report0 = 'Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape\n",
644
- " imbalance_report1 = 'Value counts of the target variable after undersampling with RandomUnderSampler: \\n', y_train_res.value_counts()\n",
645
  " imbalance_var = 'undersampling'\n",
646
  " return X_train_res, y_train_res\n",
647
  " \n",
@@ -650,7 +689,7 @@
650
  " ros = RandomOverSampler(random_state=42)\n",
651
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
652
  " imbalance_report0 = 'Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape\n",
653
- " imbalance_report1 = 'Value counts of the target variable after oversampling with RandomOverSampler: \\n', y_train_res.value_counts()\n",
654
  " imbalance_var = 'rose'\n",
655
  " return X_train_res, y_train_res\n",
656
  " \n",
@@ -659,7 +698,7 @@
659
  " X_train_res = X_train\n",
660
  " y_train_res = y_train\n",
661
  " imbalance_report0 = 'Shape of the training set after no resampling: ', X_train_res.shape\n",
662
- " imbalance_report1 = 'Value counts of the target variable after no resampling: \\n', y_train_res.value_counts()\n",
663
  " imbalance_var = 'none'\n",
664
  " return X_train_res, y_train_res\n",
665
  " \n",
@@ -684,8 +723,12 @@
684
  },
685
  {
686
  "cell_type": "code",
687
- "execution_count": 367,
688
- "metadata": {},
 
 
 
 
689
  "outputs": [],
690
  "source": [
691
  "# define a function where you can choose the model you want to use to train the data\n",
@@ -757,8 +800,12 @@
757
  },
758
  {
759
  "cell_type": "code",
760
- "execution_count": 368,
761
- "metadata": {},
 
 
 
 
762
  "outputs": [],
763
  "source": [
764
  "evaluation_score_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'model_variables'])\n",
@@ -779,15 +826,17 @@
779
  },
780
  {
781
  "cell_type": "code",
782
- "execution_count": 369,
783
- "metadata": {},
 
 
 
 
784
  "outputs": [],
785
  "source": [
786
  "#define a function that prints the strings below\n",
787
  "def evaluate_models(model='random_forest'):\n",
788
  " \n",
789
- " print('--------------------------------------------------')\n",
790
- "\n",
791
  " all_models = ['random_forest', 'logistic_regression', 'knn', 'svm', 'naive_bayes', 'decision_tree', 'xgboost']\n",
792
  " evaluation_score_append = []\n",
793
  " evaluation_count_append = []\n",
@@ -882,24 +931,28 @@
882
  },
883
  {
884
  "cell_type": "code",
885
- "execution_count": 370,
886
- "metadata": {},
 
 
 
 
887
  "outputs": [
888
  {
889
  "data": {
890
  "application/mercury+json": {
891
- "code_uid": "Text.0.40.15.8-rand4a43baec",
892
  "disabled": false,
893
  "hidden": false,
894
  "label": "Missing Value Threeshold",
895
- "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
896
  "rows": 1,
897
  "url_key": "",
898
  "value": "50",
899
  "widget": "Text"
900
  },
901
  "application/vnd.jupyter.widget-view+json": {
902
- "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
903
  "version_major": 2,
904
  "version_minor": 0
905
  },
@@ -913,18 +966,18 @@
913
  {
914
  "data": {
915
  "application/mercury+json": {
916
- "code_uid": "Text.0.40.15.11-rand6f838484",
917
  "disabled": false,
918
  "hidden": false,
919
  "label": "Variance Threshold",
920
- "model_id": "97419c4a49954b8490aa311870d010b9",
921
  "rows": 1,
922
  "url_key": "",
923
  "value": "0.05",
924
  "widget": "Text"
925
  },
926
  "application/vnd.jupyter.widget-view+json": {
927
- "model_id": "97419c4a49954b8490aa311870d010b9",
928
  "version_major": 2,
929
  "version_minor": 0
930
  },
@@ -938,18 +991,18 @@
938
  {
939
  "data": {
940
  "application/mercury+json": {
941
- "code_uid": "Text.0.40.15.14-rand6243cbfa",
942
  "disabled": false,
943
  "hidden": false,
944
  "label": "Correlation Threshold",
945
- "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
946
  "rows": 1,
947
  "url_key": "",
948
  "value": "0.95",
949
  "widget": "Text"
950
  },
951
  "application/vnd.jupyter.widget-view+json": {
952
- "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
953
  "version_major": 2,
954
  "version_minor": 0
955
  },
@@ -969,17 +1022,17 @@
969
  4,
970
  5
971
  ],
972
- "code_uid": "Select.0.40.16.18-randa184b437",
973
  "disabled": false,
974
  "hidden": false,
975
  "label": "Outlier Removal Threshold",
976
- "model_id": "0be493385a154210b3c7685a3bd1074f",
977
  "url_key": "",
978
  "value": 5,
979
  "widget": "Select"
980
  },
981
  "application/vnd.jupyter.widget-view+json": {
982
- "model_id": "0be493385a154210b3c7685a3bd1074f",
983
  "version_major": 2,
984
  "version_minor": 0
985
  },
@@ -999,17 +1052,17 @@
999
  "minmax",
1000
  "robust"
1001
  ],
1002
- "code_uid": "Select.0.40.16.25-rand163d8992",
1003
  "disabled": false,
1004
  "hidden": false,
1005
  "label": "Scaling Variables",
1006
- "model_id": "985eab871677416f9c14ea528b0fd561",
1007
  "url_key": "",
1008
  "value": "standard",
1009
  "widget": "Select"
1010
  },
1011
  "application/vnd.jupyter.widget-view+json": {
1012
- "model_id": "985eab871677416f9c14ea528b0fd561",
1013
  "version_major": 2,
1014
  "version_minor": 0
1015
  },
@@ -1029,17 +1082,17 @@
1029
  "knn",
1030
  "most_frequent"
1031
  ],
1032
- "code_uid": "Select.0.40.16.29-randb76d7c1d",
1033
  "disabled": false,
1034
  "hidden": false,
1035
  "label": "Imputation Methods",
1036
- "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1037
  "url_key": "",
1038
  "value": "median",
1039
  "widget": "Select"
1040
  },
1041
  "application/vnd.jupyter.widget-view+json": {
1042
- "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1043
  "version_major": 2,
1044
  "version_minor": 0
1045
  },
@@ -1060,17 +1113,17 @@
1060
  "pca",
1061
  "boruta"
1062
  ],
1063
- "code_uid": "Select.0.40.16.34-rand254bd909",
1064
  "disabled": false,
1065
  "hidden": false,
1066
  "label": "Feature Selection",
1067
- "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1068
  "url_key": "",
1069
  "value": "lasso",
1070
  "widget": "Select"
1071
  },
1072
  "application/vnd.jupyter.widget-view+json": {
1073
- "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1074
  "version_major": 2,
1075
  "version_minor": 0
1076
  },
@@ -1090,17 +1143,17 @@
1090
  "undersampling",
1091
  "rose"
1092
  ],
1093
- "code_uid": "Select.0.40.16.38-rand75e4d938",
1094
  "disabled": false,
1095
  "hidden": false,
1096
  "label": "Imbalance Treatment",
1097
- "model_id": "965a81a69265473a830f8eec5e8ba2df",
1098
  "url_key": "",
1099
  "value": "smote",
1100
  "widget": "Select"
1101
  },
1102
  "application/vnd.jupyter.widget-view+json": {
1103
- "model_id": "965a81a69265473a830f8eec5e8ba2df",
1104
  "version_major": 2,
1105
  "version_minor": 0
1106
  },
@@ -1123,17 +1176,17 @@
1123
  "decision_tree",
1124
  "xgboost"
1125
  ],
1126
- "code_uid": "Select.0.40.16.42-rand1bbd78ac",
1127
  "disabled": false,
1128
  "hidden": false,
1129
  "label": "Model Selection",
1130
- "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1131
  "url_key": "",
1132
  "value": "random_forest",
1133
  "widget": "Select"
1134
  },
1135
  "application/vnd.jupyter.widget-view+json": {
1136
- "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1137
  "version_major": 2,
1138
  "version_minor": 0
1139
  },
@@ -1143,13 +1196,6 @@
1143
  },
1144
  "metadata": {},
1145
  "output_type": "display_data"
1146
- },
1147
- {
1148
- "name": "stdout",
1149
- "output_type": "stream",
1150
- "text": [
1151
- "<class 'list'>\n"
1152
- ]
1153
  }
1154
  ],
1155
  "source": [
@@ -1245,17 +1291,13 @@
1245
  },
1246
  {
1247
  "cell_type": "code",
1248
- "execution_count": 371,
1249
- "metadata": {},
1250
- "outputs": [
1251
- {
1252
- "name": "stdout",
1253
- "output_type": "stream",
1254
- "text": [
1255
- "--------------------------------------------------\n"
1256
- ]
1257
  }
1258
- ],
 
1259
  "source": [
1260
  "evaluation_score_output, evaluation_counts_output = evaluate_models(input_model)"
1261
  ]
@@ -1263,62 +1305,31 @@
1263
  {
1264
  "attachments": {},
1265
  "cell_type": "markdown",
1266
- "metadata": {},
 
 
 
 
1267
  "source": [
1268
  "#### **Confusion Matrix**"
1269
  ]
1270
  },
1271
  {
1272
  "cell_type": "code",
1273
- "execution_count": 372,
1274
- "metadata": {},
 
 
 
 
1275
  "outputs": [
1276
  {
1277
- "data": {
1278
- "text/html": [
1279
- "<div>\n",
1280
- "<style scoped>\n",
1281
- " .dataframe tbody tr th:only-of-type {\n",
1282
- " vertical-align: middle;\n",
1283
- " }\n",
1284
- "\n",
1285
- " .dataframe tbody tr th {\n",
1286
- " vertical-align: top;\n",
1287
- " }\n",
1288
- "\n",
1289
- " .dataframe thead th {\n",
1290
- " text-align: right;\n",
1291
- " }\n",
1292
- "</style>\n",
1293
- "<table border=\"1\" class=\"dataframe\">\n",
1294
- " <thead>\n",
1295
- " <tr style=\"text-align: right;\">\n",
1296
- " <th></th>\n",
1297
- " <th>Accuracy</th>\n",
1298
- " <th>Precision</th>\n",
1299
- " <th>Recall</th>\n",
1300
- " <th>F1-score</th>\n",
1301
- " </tr>\n",
1302
- " </thead>\n",
1303
- " <tbody>\n",
1304
- " <tr>\n",
1305
- " <th>0</th>\n",
1306
- " <td>0.89</td>\n",
1307
- " <td>0.15</td>\n",
1308
- " <td>0.15</td>\n",
1309
- " <td>0.15</td>\n",
1310
- " </tr>\n",
1311
- " </tbody>\n",
1312
- "</table>\n",
1313
- "</div>"
1314
- ],
1315
- "text/plain": [
1316
- " Accuracy Precision Recall F1-score\n",
1317
- "0 0.89 0.15 0.15 0.15"
1318
- ]
1319
- },
1320
- "metadata": {},
1321
- "output_type": "display_data"
1322
  },
1323
  {
1324
  "data": {
@@ -1343,29 +1354,36 @@
1343
  " show_normed=True\n",
1344
  ")\n",
1345
  "\n",
1346
- "display(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])"
 
1347
  ]
1348
  },
1349
  {
1350
  "attachments": {},
1351
  "cell_type": "markdown",
1352
- "metadata": {},
 
 
 
 
1353
  "source": [
1354
  "### **Transformations Report**"
1355
  ]
1356
  },
1357
  {
1358
  "cell_type": "code",
1359
- "execution_count": 373,
1360
- "metadata": {},
 
 
 
 
1361
  "outputs": [
1362
  {
1363
  "name": "stdout",
1364
  "output_type": "stream",
1365
  "text": [
1366
- "------------------------------------------\n",
1367
  "FEATURE REMOVAL\n",
1368
- "('Shape of the dataframe is:', (1175, 590))\n",
1369
  "('the number of columns dropped due to duplications is: ', 104)\n",
1370
  "('the number of columns dropped due to missing values is: ', 28)\n",
1371
  "('the number of columns dropped due to low variance is: ', 189)\n",
@@ -1391,7 +1409,7 @@
1391
  "------------------------------------------\n",
1392
  "IMBALANCE TREATMENT\n",
1393
  "('Shape of the training set after oversampling with SMOTE: ', (2194, 14))\n",
1394
- "('Value counts of the target variable after oversampling with SMOTE: \\n', pass/fail\n",
1395
  "0 1097\n",
1396
  "1 1097\n",
1397
  "dtype: int64)\n"
@@ -1399,9 +1417,7 @@
1399
  }
1400
  ],
1401
  "source": [
1402
- "print('------------------------------------------')\n",
1403
  "print('FEATURE REMOVAL')\n",
1404
- "print(feature_removal_report0)\n",
1405
  "print(feature_removal_report1)\n",
1406
  "print(feature_removal_report2)\n",
1407
  "print(feature_removal_report3)\n",
 
26
  },
27
  {
28
  "cell_type": "code",
29
+ "execution_count": 431,
30
+ "metadata": {
31
+ "slideshow": {
32
+ "slide_type": "skip"
33
+ }
34
+ },
35
  "outputs": [],
36
  "source": [
37
  "# import pandas for data manipulation\n",
 
57
  },
58
  {
59
  "cell_type": "code",
60
+ "execution_count": 432,
61
+ "metadata": {
62
+ "slideshow": {
63
+ "slide_type": "skip"
64
+ }
65
+ },
66
  "outputs": [
67
  {
68
  "data": {
69
  "application/mercury+json": {
70
  "allow_download": true,
71
+ "code_uid": "App.0.40.24.1-randf68a3764",
72
  "continuous_update": false,
73
  "description": "Recumpute everything dynamically",
74
  "full_screen": true,
 
100
  },
101
  {
102
  "cell_type": "code",
103
+ "execution_count": 433,
104
+ "metadata": {
105
+ "slideshow": {
106
+ "slide_type": "skip"
107
+ }
108
+ },
109
  "outputs": [],
110
  "source": [
111
  "# Read the features data from the CSV URL into pandas dataframes and rename the columns to F1, F2, F3, etc.\n",
 
141
  },
142
  {
143
  "cell_type": "code",
144
+ "execution_count": 434,
145
+ "metadata": {
146
+ "slideshow": {
147
+ "slide_type": "skip"
148
+ }
149
+ },
150
  "outputs": [
151
  {
152
  "data": {
153
  "application/mercury+json": {
154
+ "code_uid": "Text.0.40.15.11-randa5faa9c1",
155
  "disabled": false,
156
  "hidden": false,
157
  "label": "Test Size Ratio",
158
+ "model_id": "a2eb64736c1146fc835a6b2afa84c9c8",
159
  "rows": 1,
160
  "url_key": "",
161
  "value": "0.25",
162
  "widget": "Text"
163
  },
164
  "application/vnd.jupyter.widget-view+json": {
165
+ "model_id": "a2eb64736c1146fc835a6b2afa84c9c8",
166
  "version_major": 2,
167
  "version_minor": 0
168
  },
 
176
  {
177
  "data": {
178
  "application/mercury+json": {
179
+ "code_uid": "Text.0.40.15.14-rand83abdf01",
180
  "disabled": false,
181
  "hidden": false,
182
  "label": "Random State Integer",
183
+ "model_id": "7c9d97ed67cb4252a11f2802fc495482",
184
  "rows": 1,
185
  "url_key": "",
186
  "value": "13",
187
  "widget": "Text"
188
  },
189
  "application/vnd.jupyter.widget-view+json": {
190
+ "model_id": "7c9d97ed67cb4252a11f2802fc495482",
191
  "version_major": 2,
192
  "version_minor": 0
193
  },
 
236
  },
237
  {
238
  "cell_type": "code",
239
+ "execution_count": 435,
240
+ "metadata": {
241
+ "slideshow": {
242
+ "slide_type": "skip"
243
+ }
244
+ },
245
  "outputs": [],
246
  "source": [
247
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
 
322
  " global correlation_threshold_var\n",
323
  " correlation_threshold_var = correlation_threshold\n",
324
  " \n",
 
325
  " return dropped"
326
  ]
327
  },
 
339
  },
340
  {
341
  "cell_type": "code",
342
+ "execution_count": 436,
343
+ "metadata": {
344
+ "slideshow": {
345
+ "slide_type": "skip"
346
+ }
347
+ },
348
  "outputs": [],
349
  "source": [
350
  "def outlier_removal(z_df, z_threshold=4):\n",
 
392
  },
393
  {
394
  "cell_type": "code",
395
+ "execution_count": 437,
396
+ "metadata": {
397
+ "slideshow": {
398
+ "slide_type": "skip"
399
+ }
400
+ },
401
  "outputs": [],
402
  "source": [
403
  "# define a function to scale the dataframe using different scaling models\n",
 
471
  },
472
  {
473
  "cell_type": "code",
474
+ "execution_count": 438,
475
+ "metadata": {
476
+ "slideshow": {
477
+ "slide_type": "skip"
478
+ }
479
+ },
480
  "outputs": [],
481
  "source": [
482
  "# define a function to impute missing values using different imputation models\n",
 
560
  },
561
  {
562
  "cell_type": "code",
563
+ "execution_count": 439,
564
+ "metadata": {
565
+ "slideshow": {
566
+ "slide_type": "skip"
567
+ }
568
+ },
569
  "outputs": [],
570
  "source": [
571
  "def feature_selection(method, X_train, y_train):\n",
 
650
  },
651
  {
652
  "cell_type": "code",
653
+ "execution_count": 440,
654
+ "metadata": {
655
+ "slideshow": {
656
+ "slide_type": "skip"
657
+ }
658
+ },
659
  "outputs": [],
660
  "source": [
661
  "# define a function to oversample or undersample the training set to treat the class imbalance\n",
 
671
  " sm = SMOTE(random_state=42)\n",
672
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
673
  " imbalance_report0 = 'Shape of the training set after oversampling with SMOTE: ', X_train_res.shape\n",
674
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with SMOTE: ', y_train_res.value_counts()\n",
675
  " imbalance_var = 'smote'\n",
676
  " return X_train_res, y_train_res\n",
677
  " \n",
 
680
  " rus = RandomUnderSampler(random_state=42)\n",
681
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
682
  " imbalance_report0 = 'Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape\n",
683
+ " imbalance_report1 = 'Value counts of the target variable after undersampling with RandomUnderSampler: ', y_train_res.value_counts()\n",
684
  " imbalance_var = 'undersampling'\n",
685
  " return X_train_res, y_train_res\n",
686
  " \n",
 
689
  " ros = RandomOverSampler(random_state=42)\n",
690
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
691
  " imbalance_report0 = 'Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape\n",
692
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with RandomOverSampler: ', y_train_res.value_counts()\n",
693
  " imbalance_var = 'rose'\n",
694
  " return X_train_res, y_train_res\n",
695
  " \n",
 
698
  " X_train_res = X_train\n",
699
  " y_train_res = y_train\n",
700
  " imbalance_report0 = 'Shape of the training set after no resampling: ', X_train_res.shape\n",
701
+ " imbalance_report1 = 'Value counts of the target variable after no resampling: ', y_train_res.value_counts()\n",
702
  " imbalance_var = 'none'\n",
703
  " return X_train_res, y_train_res\n",
704
  " \n",
 
723
  },
724
  {
725
  "cell_type": "code",
726
+ "execution_count": 441,
727
+ "metadata": {
728
+ "slideshow": {
729
+ "slide_type": "skip"
730
+ }
731
+ },
732
  "outputs": [],
733
  "source": [
734
  "# define a function where you can choose the model you want to use to train the data\n",
 
800
  },
801
  {
802
  "cell_type": "code",
803
+ "execution_count": 442,
804
+ "metadata": {
805
+ "slideshow": {
806
+ "slide_type": "skip"
807
+ }
808
+ },
809
  "outputs": [],
810
  "source": [
811
  "evaluation_score_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'model_variables'])\n",
 
826
  },
827
  {
828
  "cell_type": "code",
829
+ "execution_count": 443,
830
+ "metadata": {
831
+ "slideshow": {
832
+ "slide_type": "skip"
833
+ }
834
+ },
835
  "outputs": [],
836
  "source": [
837
  "#define a function that prints the strings below\n",
838
  "def evaluate_models(model='random_forest'):\n",
839
  " \n",
 
 
840
  " all_models = ['random_forest', 'logistic_regression', 'knn', 'svm', 'naive_bayes', 'decision_tree', 'xgboost']\n",
841
  " evaluation_score_append = []\n",
842
  " evaluation_count_append = []\n",
 
931
  },
932
  {
933
  "cell_type": "code",
934
+ "execution_count": 444,
935
+ "metadata": {
936
+ "slideshow": {
937
+ "slide_type": "skip"
938
+ }
939
+ },
940
  "outputs": [
941
  {
942
  "data": {
943
  "application/mercury+json": {
944
+ "code_uid": "Text.0.40.15.8-rand27c6053f",
945
  "disabled": false,
946
  "hidden": false,
947
  "label": "Missing Value Threeshold",
948
+ "model_id": "9bf214b16a4342099c9edd6fdda6cca9",
949
  "rows": 1,
950
  "url_key": "",
951
  "value": "50",
952
  "widget": "Text"
953
  },
954
  "application/vnd.jupyter.widget-view+json": {
955
+ "model_id": "9bf214b16a4342099c9edd6fdda6cca9",
956
  "version_major": 2,
957
  "version_minor": 0
958
  },
 
966
  {
967
  "data": {
968
  "application/mercury+json": {
969
+ "code_uid": "Text.0.40.15.11-rand5d52d01b",
970
  "disabled": false,
971
  "hidden": false,
972
  "label": "Variance Threshold",
973
+ "model_id": "98b6b9bb59ec43f1bc6c824e38f4eddd",
974
  "rows": 1,
975
  "url_key": "",
976
  "value": "0.05",
977
  "widget": "Text"
978
  },
979
  "application/vnd.jupyter.widget-view+json": {
980
+ "model_id": "98b6b9bb59ec43f1bc6c824e38f4eddd",
981
  "version_major": 2,
982
  "version_minor": 0
983
  },
 
991
  {
992
  "data": {
993
  "application/mercury+json": {
994
+ "code_uid": "Text.0.40.15.14-randd7d692a8",
995
  "disabled": false,
996
  "hidden": false,
997
  "label": "Correlation Threshold",
998
+ "model_id": "b4e4bb3cc6414fcaa12c01b283081d96",
999
  "rows": 1,
1000
  "url_key": "",
1001
  "value": "0.95",
1002
  "widget": "Text"
1003
  },
1004
  "application/vnd.jupyter.widget-view+json": {
1005
+ "model_id": "b4e4bb3cc6414fcaa12c01b283081d96",
1006
  "version_major": 2,
1007
  "version_minor": 0
1008
  },
 
1022
  4,
1023
  5
1024
  ],
1025
+ "code_uid": "Select.0.40.16.18-rand6188731c",
1026
  "disabled": false,
1027
  "hidden": false,
1028
  "label": "Outlier Removal Threshold",
1029
+ "model_id": "48828625c53c4fe9ae8ad3abdab7bca6",
1030
  "url_key": "",
1031
  "value": 5,
1032
  "widget": "Select"
1033
  },
1034
  "application/vnd.jupyter.widget-view+json": {
1035
+ "model_id": "48828625c53c4fe9ae8ad3abdab7bca6",
1036
  "version_major": 2,
1037
  "version_minor": 0
1038
  },
 
1052
  "minmax",
1053
  "robust"
1054
  ],
1055
+ "code_uid": "Select.0.40.16.25-rand4ff0ac92",
1056
  "disabled": false,
1057
  "hidden": false,
1058
  "label": "Scaling Variables",
1059
+ "model_id": "4268185d86f34c559e1444de3c1739d9",
1060
  "url_key": "",
1061
  "value": "standard",
1062
  "widget": "Select"
1063
  },
1064
  "application/vnd.jupyter.widget-view+json": {
1065
+ "model_id": "4268185d86f34c559e1444de3c1739d9",
1066
  "version_major": 2,
1067
  "version_minor": 0
1068
  },
 
1082
  "knn",
1083
  "most_frequent"
1084
  ],
1085
+ "code_uid": "Select.0.40.16.29-rand9bb317f9",
1086
  "disabled": false,
1087
  "hidden": false,
1088
  "label": "Imputation Methods",
1089
+ "model_id": "a147c118c8f14de28b280232786f146a",
1090
  "url_key": "",
1091
  "value": "median",
1092
  "widget": "Select"
1093
  },
1094
  "application/vnd.jupyter.widget-view+json": {
1095
+ "model_id": "a147c118c8f14de28b280232786f146a",
1096
  "version_major": 2,
1097
  "version_minor": 0
1098
  },
 
1113
  "pca",
1114
  "boruta"
1115
  ],
1116
+ "code_uid": "Select.0.40.16.34-rand7cda1892",
1117
  "disabled": false,
1118
  "hidden": false,
1119
  "label": "Feature Selection",
1120
+ "model_id": "ed31020a12d842a9b6e77a88344adfd6",
1121
  "url_key": "",
1122
  "value": "lasso",
1123
  "widget": "Select"
1124
  },
1125
  "application/vnd.jupyter.widget-view+json": {
1126
+ "model_id": "ed31020a12d842a9b6e77a88344adfd6",
1127
  "version_major": 2,
1128
  "version_minor": 0
1129
  },
 
1143
  "undersampling",
1144
  "rose"
1145
  ],
1146
+ "code_uid": "Select.0.40.16.38-randc6301b14",
1147
  "disabled": false,
1148
  "hidden": false,
1149
  "label": "Imbalance Treatment",
1150
+ "model_id": "ef37d1810f974d2081c0cd9bed1d4384",
1151
  "url_key": "",
1152
  "value": "smote",
1153
  "widget": "Select"
1154
  },
1155
  "application/vnd.jupyter.widget-view+json": {
1156
+ "model_id": "ef37d1810f974d2081c0cd9bed1d4384",
1157
  "version_major": 2,
1158
  "version_minor": 0
1159
  },
 
1176
  "decision_tree",
1177
  "xgboost"
1178
  ],
1179
+ "code_uid": "Select.0.40.16.42-randce0898a7",
1180
  "disabled": false,
1181
  "hidden": false,
1182
  "label": "Model Selection",
1183
+ "model_id": "02c163a5f04e4dde8adda8eb149814d0",
1184
  "url_key": "",
1185
  "value": "random_forest",
1186
  "widget": "Select"
1187
  },
1188
  "application/vnd.jupyter.widget-view+json": {
1189
+ "model_id": "02c163a5f04e4dde8adda8eb149814d0",
1190
  "version_major": 2,
1191
  "version_minor": 0
1192
  },
 
1196
  },
1197
  "metadata": {},
1198
  "output_type": "display_data"
 
 
 
 
 
 
 
1199
  }
1200
  ],
1201
  "source": [
 
1291
  },
1292
  {
1293
  "cell_type": "code",
1294
+ "execution_count": 445,
1295
+ "metadata": {
1296
+ "slideshow": {
1297
+ "slide_type": "skip"
 
 
 
 
 
1298
  }
1299
+ },
1300
+ "outputs": [],
1301
  "source": [
1302
  "evaluation_score_output, evaluation_counts_output = evaluate_models(input_model)"
1303
  ]
 
1305
  {
1306
  "attachments": {},
1307
  "cell_type": "markdown",
1308
+ "metadata": {
1309
+ "slideshow": {
1310
+ "slide_type": "skip"
1311
+ }
1312
+ },
1313
  "source": [
1314
  "#### **Confusion Matrix**"
1315
  ]
1316
  },
1317
  {
1318
  "cell_type": "code",
1319
+ "execution_count": 446,
1320
+ "metadata": {
1321
+ "slideshow": {
1322
+ "slide_type": "slide"
1323
+ }
1324
+ },
1325
  "outputs": [
1326
  {
1327
+ "name": "stdout",
1328
+ "output_type": "stream",
1329
+ "text": [
1330
+ " Accuracy Precision Recall F1-score\n",
1331
+ "0 0.89 0.15 0.15 0.15\n"
1332
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1333
  },
1334
  {
1335
  "data": {
 
1354
  " show_normed=True\n",
1355
  ")\n",
1356
  "\n",
1357
+ "print(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])\n",
1358
+ "plt.show()"
1359
  ]
1360
  },
1361
  {
1362
  "attachments": {},
1363
  "cell_type": "markdown",
1364
+ "metadata": {
1365
+ "slideshow": {
1366
+ "slide_type": "skip"
1367
+ }
1368
+ },
1369
  "source": [
1370
  "### **Transformations Report**"
1371
  ]
1372
  },
1373
  {
1374
  "cell_type": "code",
1375
+ "execution_count": 447,
1376
+ "metadata": {
1377
+ "slideshow": {
1378
+ "slide_type": "slide"
1379
+ }
1380
+ },
1381
  "outputs": [
1382
  {
1383
  "name": "stdout",
1384
  "output_type": "stream",
1385
  "text": [
 
1386
  "FEATURE REMOVAL\n",
 
1387
  "('the number of columns dropped due to duplications is: ', 104)\n",
1388
  "('the number of columns dropped due to missing values is: ', 28)\n",
1389
  "('the number of columns dropped due to low variance is: ', 189)\n",
 
1409
  "------------------------------------------\n",
1410
  "IMBALANCE TREATMENT\n",
1411
  "('Shape of the training set after oversampling with SMOTE: ', (2194, 14))\n",
1412
+ "('Value counts of the target variable after oversampling with SMOTE: ', pass/fail\n",
1413
  "0 1097\n",
1414
  "1 1097\n",
1415
  "dtype: int64)\n"
 
1417
  }
1418
  ],
1419
  "source": [
 
1420
  "print('FEATURE REMOVAL')\n",
 
1421
  "print(feature_removal_report1)\n",
1422
  "print(feature_removal_report2)\n",
1423
  "print(feature_removal_report3)\n",