erjonb committed on
Commit 51f4ca9 · 1 Parent(s): e1dc96a

Upload P2 - Secom Notebook2 - Mercury.ipynb

Files changed (1)
  1. P2 - Secom Notebook2 - Mercury.ipynb +202 -321
P2 - Secom Notebook2 - Mercury.ipynb CHANGED
@@ -26,7 +26,7 @@
26
  },
27
  {
28
  "cell_type": "code",
29
- "execution_count": 85,
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
@@ -53,14 +53,14 @@
53
  },
54
  {
55
  "cell_type": "code",
56
- "execution_count": 86,
57
  "metadata": {},
58
  "outputs": [
59
  {
60
  "data": {
61
  "application/mercury+json": {
62
  "allow_download": true,
63
- "code_uid": "App.0.40.24.1-rand8c10e2d9",
64
  "continuous_update": false,
65
  "description": "Recumpute everything dynamically",
66
  "full_screen": true,
@@ -92,7 +92,7 @@
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": 87,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
@@ -129,24 +129,24 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 88,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "data": {
137
  "application/mercury+json": {
138
- "code_uid": "Text.0.40.15.11-rand39f89858",
139
  "disabled": false,
140
  "hidden": false,
141
  "label": "Test Size Ratio",
142
- "model_id": "271115d337014695a05d7e83307b4cc4",
143
  "rows": 1,
144
  "url_key": "",
145
  "value": "0.25",
146
  "widget": "Text"
147
  },
148
  "application/vnd.jupyter.widget-view+json": {
149
- "model_id": "271115d337014695a05d7e83307b4cc4",
150
  "version_major": 2,
151
  "version_minor": 0
152
  },
@@ -160,18 +160,18 @@
160
  {
161
  "data": {
162
  "application/mercury+json": {
163
- "code_uid": "Text.0.40.15.14-randf159337c",
164
  "disabled": false,
165
  "hidden": false,
166
  "label": "Random State Integer",
167
- "model_id": "87a237754fa24e11a17700de955552a8",
168
  "rows": 1,
169
  "url_key": "",
170
  "value": "13",
171
  "widget": "Text"
172
  },
173
  "application/vnd.jupyter.widget-view+json": {
174
- "model_id": "87a237754fa24e11a17700de955552a8",
175
  "version_major": 2,
176
  "version_minor": 0
177
  },
@@ -220,31 +220,37 @@
220
  },
221
  {
222
  "cell_type": "code",
223
- "execution_count": 89,
224
  "metadata": {},
225
  "outputs": [],
226
  "source": [
227
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
228
  " correlation_threshold=1.1):\n",
229
  " \n",
230
- " print('------------------------------------------')\n",
231
- " print('FEATURE REMOVAL')\n",
232
  " \n",
233
- " print('Shape of the dataframe is: ', df.shape)\n",
 
234
  "\n",
235
  " # Drop duplicated columns\n",
236
  " if drop_duplicates == 'yes':\n",
237
  " new_column_names = df.columns\n",
238
  " df = df.T.drop_duplicates().T\n",
239
- " print('the number of columns dropped due to duplications is: ', len(new_column_names) - len(df.columns))\n",
240
  " drop_duplicated = list(set(new_column_names) - set(df.columns))\n",
241
  "\n",
242
  " elif drop_duplicates == 'no':\n",
243
  " df = df.T.T\n",
244
- " print('No columns were dropped due to duplications') \n",
245
  "\n",
246
  " # Print the percentage of columns in df with missing values more than or equal to threshold\n",
247
- " print('the number of columns dropped due to missing values is: ', len(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index))\n",
248
  " \n",
249
  " # Print into a list the columns to be dropped due to missing values\n",
250
  " drop_missing = list(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
@@ -253,7 +259,7 @@
253
  " df.drop(drop_missing, axis=1, inplace=True)\n",
254
  " \n",
255
  " # Print the number of columns in df with variance less than threshold\n",
256
- " print('the number of columns dropped due to low variance is: ', len(df.var()[df.var() <= variance_threshold].index))\n",
257
  "\n",
258
  " # Print into a list the columns to be dropped due to low variance\n",
259
  " drop_variance = list(df.var()[df.var() <= variance_threshold].index)\n",
@@ -267,7 +273,7 @@
267
  " corr_matrix = df.corr().abs().round(4)\n",
268
  " upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n",
269
  " to_drop = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
270
- " print('the number of columns dropped due to high correlation is: ', len(to_drop))\n",
271
  "\n",
272
  " # Print into a list the columns to be dropped due to high correlation\n",
273
  " drop_correlation = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
@@ -281,8 +287,8 @@
281
  " elif drop_duplicates =='no':\n",
282
  " dropped = (drop_missing+drop_variance+drop_correlation)\n",
283
  " \n",
284
- " print('Total number of columns to be dropped is: ', len(dropped))\n",
285
- " print('New shape of the dataframe is: ', df.shape)\n",
286
  "\n",
287
  " global drop_duplicates_var\n",
288
  " drop_duplicates_var = drop_duplicates\n",
@@ -314,24 +320,24 @@
314
  },
315
  {
316
  "cell_type": "code",
317
- "execution_count": 90,
318
  "metadata": {},
319
  "outputs": [],
320
  "source": [
321
  "def outlier_removal(z_df, z_threshold=4):\n",
322
  " \n",
323
  " global outlier_var\n",
324
  "\n",
325
- " print('------------------------------------------')\n",
326
- " print('OUTLIER REMOVAL')\n",
327
  "\n",
328
  " if z_threshold == 'none':\n",
329
- " print('No outliers were removed')\n",
330
  " outlier_var = 'none'\n",
331
  " return z_df\n",
332
  " \n",
333
  " else:\n",
334
- " print('The z-score threshold is:', z_threshold)\n",
335
  "\n",
336
  " z_df_copy = z_df.copy()\n",
337
  "\n",
@@ -342,11 +348,10 @@
342
  " z_df_copy[outliers_mask] = np.nan\n",
343
  "\n",
344
  " outliers_count = np.count_nonzero(outliers_mask)\n",
345
- " print('The number of outliers removed from the dataset is:', outliers_count)\n",
346
  "\n",
347
  " outlier_var = z_threshold\n",
348
  "\n",
349
- " print(type(z_df_copy))\n",
350
  " return z_df_copy"
351
  ]
352
  },
@@ -364,7 +369,7 @@
364
  },
365
  {
366
  "cell_type": "code",
367
- "execution_count": 91,
368
  "metadata": {},
369
  "outputs": [],
370
  "source": [
@@ -373,9 +378,7 @@
373
  "def scale_dataframe(scale_model,df_fit, df_transform):\n",
374
  " \n",
375
  " global scale_model_var\n",
376
- "\n",
377
- " print('------------------------------------------')\n",
378
- " print('SCALING THE DATAFRAME')\n",
379
  "\n",
380
  " if scale_model == 'robust':\n",
381
  " from sklearn.preprocessing import RobustScaler\n",
@@ -383,7 +386,7 @@
383
  " scaler.fit(df_fit)\n",
384
  " df_scaled = scaler.transform(df_transform)\n",
385
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
386
- " print('The dataframe has been scaled using the robust scaling model')\n",
387
  " scale_model_var = 'robust'\n",
388
  " return df_scaled\n",
389
  " \n",
@@ -393,7 +396,7 @@
393
  " scaler.fit(df_fit)\n",
394
  " df_scaled = scaler.transform(df_transform)\n",
395
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
396
- " print('The dataframe has been scaled using the standard scaling model')\n",
397
  " scale_model_var = 'standard'\n",
398
  " return df_scaled\n",
399
  " \n",
@@ -403,7 +406,7 @@
403
  " scaler.fit(df_fit)\n",
404
  " df_scaled = scaler.transform(df_transform)\n",
405
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
406
- " print('The dataframe has been scaled using the normal scaling model')\n",
407
  " scale_model_var = 'normal'\n",
408
  " return df_scaled\n",
409
  " \n",
@@ -413,12 +416,12 @@
413
  " scaler.fit(df_fit)\n",
414
  " df_scaled = scaler.transform(df_transform)\n",
415
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
416
- " print('The dataframe has been scaled using the minmax scaling model')\n",
417
  " scale_model_var = 'minmax'\n",
418
  " return df_scaled\n",
419
  " \n",
420
  " elif scale_model == 'none':\n",
421
- " print('The dataframe has not been scaled')\n",
422
  " scale_model_var = 'none'\n",
423
  " return df_transform\n",
424
  " \n",
@@ -441,7 +444,7 @@
441
  },
442
  {
443
  "cell_type": "code",
444
- "execution_count": 92,
445
  "metadata": {},
446
  "outputs": [],
447
  "source": [
@@ -449,11 +452,14 @@
449
  "\n",
450
  "def impute_missing_values(imputation, df_fit, df_transform, n_neighbors=5):\n",
451
  "\n",
452
- " print('------------------------------------------')\n",
453
- " print('IMPUTATION PROCESS')\n",
454
- " print('Number of missing values before imputation: ', df_transform.isnull().sum().sum())\n",
455
- "\n",
456
  " global imputation_var\n",
457
  "\n",
458
  " if imputation == 'knn':\n",
459
  "\n",
@@ -462,8 +468,8 @@
462
  " imputer.fit(df_fit)\n",
463
  " df_imputed = imputer.transform(df_transform)\n",
464
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
465
- " print('knn imputation has been applied') \n",
466
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
467
  " imputation_var = 'knn'\n",
468
  " return df_imputed\n",
469
  " \n",
@@ -474,8 +480,8 @@
474
  " imputer.fit(df_fit)\n",
475
  " df_imputed = imputer.transform(df_transform)\n",
476
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
477
- " print('mean imputation has been applied')\n",
478
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
479
  " imputation_var = 'mean'\n",
480
  " return df_imputed\n",
481
  " \n",
@@ -486,8 +492,8 @@
486
  " imputer.fit(df_fit)\n",
487
  " df_imputed = imputer.transform(df_transform)\n",
488
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
489
- " print('median imputation has been applied')\n",
490
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
491
  " imputation_var = 'median'\n",
492
  " return df_imputed\n",
493
  " \n",
@@ -498,8 +504,8 @@
498
  " imputer.fit(df_fit)\n",
499
  " df_imputed = imputer.transform(df_transform)\n",
500
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
501
- " print('most frequent imputation has been applied')\n",
502
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
503
  " imputation_var = 'most_frequent'\n",
504
  " return df_imputed\n",
505
  " \n",
@@ -523,7 +529,7 @@
523
  },
524
  {
525
  "cell_type": "code",
526
- "execution_count": 93,
527
  "metadata": {},
528
  "outputs": [],
529
  "source": [
@@ -531,14 +537,15 @@
531
  "\n",
532
  " global feature_selection_var\n",
533
  " global selected_features\n",
534
  "\n",
535
- " print('------------------------------------------')\n",
536
- " print('FEATURE SELECTION')\n",
537
  "\n",
538
  " # if method is boruta, run boruta feature selection and return the selected features and the training set with only the selected features\n",
539
  "\n",
540
  " if method == 'boruta':\n",
541
- " print('Selected method is: ', method)\n",
542
  " from boruta import BorutaPy\n",
543
  " from sklearn.ensemble import RandomForestClassifier\n",
544
  " rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
@@ -547,48 +554,48 @@
547
  " selected_feature_indices = boruta_selector.support_\n",
548
  " selected_columns = X_train.columns[selected_feature_indices]\n",
549
  " X_train_filtered = X_train.iloc[:, selected_feature_indices]\n",
550
- " print('Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape)\n",
551
  " return X_train_filtered, selected_columns\n",
552
  " \n",
553
  " if method == 'none':\n",
554
- " print('Selected method is: ', method)\n",
555
  " X_train_filtered = X_train\n",
556
- " print('Shape of the training set after no feature selection: ', X_train_filtered.shape)\n",
557
  " feature_selection_var = 'none'\n",
558
  " selected_features = X_train_filtered.columns\n",
559
  " return X_train_filtered, selected_features \n",
560
  " \n",
561
  " if method == 'lasso':\n",
562
- " print('Selected method is: ', method)\n",
563
  " from sklearn.linear_model import LassoCV\n",
564
  " from sklearn.feature_selection import SelectFromModel\n",
565
  " lasso = LassoCV().fit(X_train, y_train)\n",
566
  " model = SelectFromModel(lasso, prefit=True)\n",
567
  " X_train_filtered = model.transform(X_train)\n",
568
  " selected_features = X_train.columns[model.get_support()]\n",
569
- " print('Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape)\n",
570
  " feature_selection_var = 'lasso'\n",
571
  " return X_train_filtered, selected_features\n",
572
  " \n",
573
  " if method == 'pca':\n",
574
- " print('Selected method is: ', method)\n",
575
  " from sklearn.decomposition import PCA\n",
576
  " pca = PCA(n_components=15)\n",
577
  " X_train_pca = pca.fit_transform(X_train)\n",
578
  " selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n",
579
- " print('Shape of the training set after feature selection with PCA: ', X_train_pca.shape)\n",
580
  " feature_selection_var = 'pca'\n",
581
  " return X_train_pca, selected_features\n",
582
  " \n",
583
  " if method == 'rfe':\n",
584
- " print('Selected method is: ', method)\n",
585
  " from sklearn.feature_selection import RFE\n",
586
  " from sklearn.ensemble import RandomForestClassifier\n",
587
  " rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n",
588
  " rfe_selector.fit(X_train, y_train)\n",
589
  " selected_features = X_train.columns[rfe_selector.support_]\n",
590
  " X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n",
591
- " print('Shape of the training set after feature selection with RFE: ', X_train_filtered.shape)\n",
592
  " feature_selection_var = 'rfe'\n",
593
  " return X_train_filtered, selected_features\n",
594
  " "
@@ -608,7 +615,7 @@
608
  },
609
  {
610
  "cell_type": "code",
611
- "execution_count": 94,
612
  "metadata": {},
613
  "outputs": [],
614
  "source": [
@@ -617,16 +624,15 @@
617
  "def imbalance_treatment(method, X_train, y_train):\n",
618
  "\n",
619
  " global imbalance_var\n",
620
- "\n",
621
- " print('------------------------------------------')\n",
622
- " print('IMBALANCE TREATMENT')\n",
623
  "\n",
624
  " if method == 'smote': \n",
625
  " from imblearn.over_sampling import SMOTE\n",
626
  " sm = SMOTE(random_state=42)\n",
627
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
628
- " print('Shape of the training set after oversampling with SMOTE: ', X_train_res.shape)\n",
629
- " print('Value counts of the target variable after oversampling with SMOTE: \\n', y_train_res.value_counts())\n",
630
  " imbalance_var = 'smote'\n",
631
  " return X_train_res, y_train_res\n",
632
  " \n",
@@ -634,8 +640,8 @@
634
  " from imblearn.under_sampling import RandomUnderSampler\n",
635
  " rus = RandomUnderSampler(random_state=42)\n",
636
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
637
- " print('Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape)\n",
638
- " print('Value counts of the target variable after undersampling with RandomUnderSampler: \\n', y_train_res.value_counts())\n",
639
  " imbalance_var = 'undersampling'\n",
640
  " return X_train_res, y_train_res\n",
641
  " \n",
@@ -643,8 +649,8 @@
643
  " from imblearn.over_sampling import RandomOverSampler\n",
644
  " ros = RandomOverSampler(random_state=42)\n",
645
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
646
- " print('Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape)\n",
647
- " print('Value counts of the target variable after oversampling with RandomOverSampler: \\n', y_train_res.value_counts())\n",
648
  " imbalance_var = 'rose'\n",
649
  " return X_train_res, y_train_res\n",
650
  " \n",
@@ -652,8 +658,8 @@
652
  " if method == 'none':\n",
653
  " X_train_res = X_train\n",
654
  " y_train_res = y_train\n",
655
- " print('Shape of the training set after no resampling: ', X_train_res.shape)\n",
656
- " print('Value counts of the target variable after no resampling: \\n', y_train_res.value_counts())\n",
657
  " imbalance_var = 'none'\n",
658
  " return X_train_res, y_train_res\n",
659
  " \n",
@@ -678,7 +684,7 @@
678
  },
679
  {
680
  "cell_type": "code",
681
- "execution_count": 95,
682
  "metadata": {},
683
  "outputs": [],
684
  "source": [
@@ -751,7 +757,7 @@
751
  },
752
  {
753
  "cell_type": "code",
754
- "execution_count": 96,
755
  "metadata": {},
756
  "outputs": [],
757
  "source": [
@@ -773,7 +779,7 @@
773
  },
774
  {
775
  "cell_type": "code",
776
- "execution_count": 101,
777
  "metadata": {},
778
  "outputs": [],
779
  "source": [
@@ -876,24 +882,24 @@
876
  },
877
  {
878
  "cell_type": "code",
879
- "execution_count": 103,
880
  "metadata": {},
881
  "outputs": [
882
  {
883
  "data": {
884
  "application/mercury+json": {
885
- "code_uid": "Text.0.40.15.8-rand3ea159f1",
886
  "disabled": false,
887
  "hidden": false,
888
  "label": "Missing Value Threeshold",
889
- "model_id": "1f155017a0e64a71ba2f1a737d93c61d",
890
  "rows": 1,
891
  "url_key": "",
892
  "value": "50",
893
  "widget": "Text"
894
  },
895
  "application/vnd.jupyter.widget-view+json": {
896
- "model_id": "1f155017a0e64a71ba2f1a737d93c61d",
897
  "version_major": 2,
898
  "version_minor": 0
899
  },
@@ -907,18 +913,18 @@
907
  {
908
  "data": {
909
  "application/mercury+json": {
910
- "code_uid": "Text.0.40.15.11-rand7772c265",
911
  "disabled": false,
912
  "hidden": false,
913
  "label": "Variance Threshold",
914
- "model_id": "6ae03558245f4658abc409261e88e273",
915
  "rows": 1,
916
  "url_key": "",
917
  "value": "0.05",
918
  "widget": "Text"
919
  },
920
  "application/vnd.jupyter.widget-view+json": {
921
- "model_id": "6ae03558245f4658abc409261e88e273",
922
  "version_major": 2,
923
  "version_minor": 0
924
  },
@@ -932,18 +938,18 @@
932
  {
933
  "data": {
934
  "application/mercury+json": {
935
- "code_uid": "Text.0.40.15.14-rand52a68a07",
936
  "disabled": false,
937
  "hidden": false,
938
  "label": "Correlation Threshold",
939
- "model_id": "325fdc5cafb046c89be6440a2a0e855c",
940
  "rows": 1,
941
  "url_key": "",
942
  "value": "0.95",
943
  "widget": "Text"
944
  },
945
  "application/vnd.jupyter.widget-view+json": {
946
- "model_id": "325fdc5cafb046c89be6440a2a0e855c",
947
  "version_major": 2,
948
  "version_minor": 0
949
  },
@@ -963,17 +969,17 @@
963
  4,
964
  5
965
  ],
966
- "code_uid": "Select.0.40.16.18-randd219fc2a",
967
  "disabled": false,
968
  "hidden": false,
969
  "label": "Outlier Removal Threshold",
970
- "model_id": "96591acea02b43f9a366ddfdfff1dfb5",
971
  "url_key": "",
972
  "value": 5,
973
  "widget": "Select"
974
  },
975
  "application/vnd.jupyter.widget-view+json": {
976
- "model_id": "96591acea02b43f9a366ddfdfff1dfb5",
977
  "version_major": 2,
978
  "version_minor": 0
979
  },
@@ -993,17 +999,17 @@
993
  "minmax",
994
  "robust"
995
  ],
996
- "code_uid": "Select.0.40.16.25-rand7528f0a1",
997
  "disabled": false,
998
  "hidden": false,
999
  "label": "Scaling Variables",
1000
- "model_id": "8fc6da55b85c4993bf420c86d2a23a2d",
1001
  "url_key": "",
1002
  "value": "standard",
1003
  "widget": "Select"
1004
  },
1005
  "application/vnd.jupyter.widget-view+json": {
1006
- "model_id": "8fc6da55b85c4993bf420c86d2a23a2d",
1007
  "version_major": 2,
1008
  "version_minor": 0
1009
  },
@@ -1023,17 +1029,17 @@
1023
  "knn",
1024
  "most_frequent"
1025
  ],
1026
- "code_uid": "Select.0.40.16.29-randb1225c9d",
1027
  "disabled": false,
1028
  "hidden": false,
1029
  "label": "Imputation Methods",
1030
- "model_id": "193450229f4543079d7a53267d8c1fe1",
1031
  "url_key": "",
1032
  "value": "median",
1033
  "widget": "Select"
1034
  },
1035
  "application/vnd.jupyter.widget-view+json": {
1036
- "model_id": "193450229f4543079d7a53267d8c1fe1",
1037
  "version_major": 2,
1038
  "version_minor": 0
1039
  },
@@ -1054,17 +1060,17 @@
1054
  "pca",
1055
  "boruta"
1056
  ],
1057
- "code_uid": "Select.0.40.16.34-rand9ae51452",
1058
  "disabled": false,
1059
  "hidden": false,
1060
  "label": "Feature Selection",
1061
- "model_id": "e556e5025dc14e9e867125680d35025f",
1062
  "url_key": "",
1063
  "value": "lasso",
1064
  "widget": "Select"
1065
  },
1066
  "application/vnd.jupyter.widget-view+json": {
1067
- "model_id": "e556e5025dc14e9e867125680d35025f",
1068
  "version_major": 2,
1069
  "version_minor": 0
1070
  },
@@ -1084,17 +1090,17 @@
1084
  "undersampling",
1085
  "rose"
1086
  ],
1087
- "code_uid": "Select.0.40.16.38-rand84f919f9",
1088
  "disabled": false,
1089
  "hidden": false,
1090
  "label": "Imbalance Treatment",
1091
- "model_id": "29560b71dbf84b45a2a487c92c077ad4",
1092
  "url_key": "",
1093
  "value": "smote",
1094
  "widget": "Select"
1095
  },
1096
  "application/vnd.jupyter.widget-view+json": {
1097
- "model_id": "29560b71dbf84b45a2a487c92c077ad4",
1098
  "version_major": 2,
1099
  "version_minor": 0
1100
  },
@@ -1117,17 +1123,17 @@
1117
  "decision_tree",
1118
  "xgboost"
1119
  ],
1120
- "code_uid": "Select.0.40.16.42-rand98b2dc54",
1121
  "disabled": false,
1122
  "hidden": false,
1123
  "label": "Model Selection",
1124
- "model_id": "d90f6d906955444e9abf735d40442d91",
1125
  "url_key": "",
1126
  "value": "random_forest",
1127
  "widget": "Select"
1128
  },
1129
  "application/vnd.jupyter.widget-view+json": {
1130
- "model_id": "d90f6d906955444e9abf735d40442d91",
1131
  "version_major": 2,
1132
  "version_minor": 0
1133
  },
@@ -1142,49 +1148,7 @@
1142
  "name": "stdout",
1143
  "output_type": "stream",
1144
  "text": [
1145
- "------------------------------------------\n",
1146
- "FEATURE REMOVAL\n",
1147
- "Shape of the dataframe is: (1175, 590)\n",
1148
- "the number of columns dropped due to duplications is: 104\n",
1149
- "the number of columns dropped due to missing values is: 28\n",
1150
- "the number of columns dropped due to low variance is: 189\n",
1151
- "the number of columns dropped due to high correlation is: 90\n",
1152
- "Total number of columns to be dropped is: 411\n",
1153
- "New shape of the dataframe is: (1175, 179)\n",
1154
- "<class 'list'>\n",
1155
- "------------------------------------------\n",
1156
- "OUTLIER REMOVAL\n",
1157
- "The z-score threshold is: 5\n",
1158
- "The number of outliers removed from the dataset is: 163\n",
1159
- "<class 'pandas.core.frame.DataFrame'>\n",
1160
- "------------------------------------------\n",
1161
- "SCALING THE DATAFRAME\n",
1162
- "The dataframe has been scaled using the standard scaling model\n",
1163
- "------------------------------------------\n",
1164
- "SCALING THE DATAFRAME\n",
1165
- "The dataframe has been scaled using the standard scaling model\n",
1166
- "------------------------------------------\n",
1167
- "IMPUTATION PROCESS\n",
1168
- "Number of missing values before imputation: 3380\n",
1169
- "median imputation has been applied\n",
1170
- "Number of missing values after imputation: 0\n",
1171
- "------------------------------------------\n",
1172
- "IMPUTATION PROCESS\n",
1173
- "Number of missing values before imputation: 1196\n",
1174
- "median imputation has been applied\n",
1175
- "Number of missing values after imputation: 0\n",
1176
- "------------------------------------------\n",
1177
- "FEATURE SELECTION\n",
1178
- "Selected method is: lasso\n",
1179
- "Shape of the training set after feature selection with LassoCV: (1175, 14)\n",
1180
- "------------------------------------------\n",
1181
- "IMBALANCE TREATMENT\n",
1182
- "Shape of the training set after oversampling with SMOTE: (2194, 14)\n",
1183
- "Value counts of the target variable after oversampling with SMOTE: \n",
1184
- " pass/fail\n",
1185
- "0 1097\n",
1186
- "1 1097\n",
1187
- "dtype: int64\n"
1188
  ]
1189
  }
1190
  ],
@@ -1281,7 +1245,7 @@
1281
  },
1282
  {
1283
  "cell_type": "code",
1284
- "execution_count": 113,
1285
  "metadata": {},
1286
  "outputs": [
1287
  {
@@ -1290,165 +1254,6 @@
1290
  "text": [
1291
  "--------------------------------------------------\n"
1292
  ]
1293
- },
1294
- {
1295
- "data": {
1296
- "text/html": [
1297
- "<div>\n",
1298
- "<style scoped>\n",
1299
- " .dataframe tbody tr th:only-of-type {\n",
1300
- " vertical-align: middle;\n",
1301
- " }\n",
1302
- "\n",
1303
- " .dataframe tbody tr th {\n",
1304
- " vertical-align: top;\n",
1305
- " }\n",
1306
- "\n",
1307
- " .dataframe thead th {\n",
1308
- " text-align: right;\n",
1309
- " }\n",
1310
- "</style>\n",
1311
- "<table border=\"1\" class=\"dataframe\">\n",
1312
- " <thead>\n",
1313
- " <tr style=\"text-align: right;\">\n",
1314
- " <th></th>\n",
1315
- " <th>Model</th>\n",
1316
- " <th>True Negatives</th>\n",
1317
- " <th>False Positives</th>\n",
1318
- " <th>False Negatives</th>\n",
1319
- " <th>True Positives</th>\n",
1320
- " <th>drop duplicates</th>\n",
1321
- " <th>missing values th</th>\n",
1322
- " <th>variance th</th>\n",
1323
- " <th>correlation th</th>\n",
1324
- " <th>outlier removal th</th>\n",
1325
- " <th>scaling method</th>\n",
1326
- " <th>imputation method</th>\n",
1327
- " <th>feature selection</th>\n",
1328
- " <th>imbalance treatment</th>\n",
1329
- " <th>model_variables</th>\n",
1330
- " </tr>\n",
1331
- " </thead>\n",
1332
- " <tbody>\n",
1333
- " <tr>\n",
1334
- " <th>0</th>\n",
1335
- " <td>random_forest</td>\n",
1336
- " <td>344</td>\n",
1337
- " <td>22</td>\n",
1338
- " <td>22</td>\n",
1339
- " <td>4</td>\n",
1340
- " <td>yes</td>\n",
1341
- " <td>50</td>\n",
1342
- " <td>0.05</td>\n",
1343
- " <td>0.95</td>\n",
1344
- " <td>5</td>\n",
1345
- " <td>standard</td>\n",
1346
- " <td>median</td>\n",
1347
- " <td>lasso</td>\n",
1348
- " <td>smote</td>\n",
1349
- " <td>yes_50_0.05_0.95_5_standard_median_lasso_smote</td>\n",
1350
- " </tr>\n",
1351
- " </tbody>\n",
1352
- "</table>\n",
1353
- "</div>"
1354
- ],
1355
- "text/plain": [
1356
- " Model True Negatives False Positives False Negatives \\\n",
1357
- "0 random_forest 344 22 22 \n",
1358
- "\n",
1359
- " True Positives drop duplicates missing values th variance th \\\n",
1360
- "0 4 yes 50 0.05 \n",
1361
- "\n",
1362
- " correlation th outlier removal th scaling method imputation method \\\n",
1363
- "0 0.95 5 standard median \n",
1364
- "\n",
1365
- " feature selection imbalance treatment \\\n",
1366
- "0 lasso smote \n",
1367
- "\n",
1368
- " model_variables \n",
1369
- "0 yes_50_0.05_0.95_5_standard_median_lasso_smote "
1370
- ]
1371
- },
1372
- "metadata": {},
1373
- "output_type": "display_data"
1374
- },
1375
- {
1376
- "data": {
1377
- "text/html": [
1378
- "<div>\n",
1379
- "<style scoped>\n",
1380
- " .dataframe tbody tr th:only-of-type {\n",
1381
- " vertical-align: middle;\n",
1382
- " }\n",
1383
- "\n",
1384
- " .dataframe tbody tr th {\n",
1385
- " vertical-align: top;\n",
1386
- " }\n",
1387
- "\n",
1388
- " .dataframe thead th {\n",
1389
- " text-align: right;\n",
1390
- " }\n",
1391
- "</style>\n",
1392
- "<table border=\"1\" class=\"dataframe\">\n",
1393
- " <thead>\n",
1394
- " <tr style=\"text-align: right;\">\n",
1395
- " <th></th>\n",
1396
- " <th>Model</th>\n",
1397
- " <th>Accuracy</th>\n",
1398
- " <th>Precision</th>\n",
1399
- " <th>Recall</th>\n",
1400
- " <th>F1-score</th>\n",
1401
- " <th>drop duplicates</th>\n",
1402
- " <th>missing values th</th>\n",
1403
- " <th>variance th</th>\n",
1404
- " <th>correlation th</th>\n",
1405
- " <th>outlier removal th</th>\n",
1406
- " <th>scaling method</th>\n",
1407
- " <th>imputation method</th>\n",
1408
- " <th>feature selection</th>\n",
1409
- " <th>imbalance treatment</th>\n",
1410
- " <th>model_variables</th>\n",
1411
- " </tr>\n",
1412
- " </thead>\n",
1413
- " <tbody>\n",
1414
- " <tr>\n",
1415
- " <th>0</th>\n",
1416
- " <td>random_forest</td>\n",
1417
- " <td>0.89</td>\n",
1418
- " <td>0.15</td>\n",
1419
- " <td>0.15</td>\n",
1420
- " <td>0.15</td>\n",
1421
- " <td>yes</td>\n",
1422
- " <td>50</td>\n",
1423
- " <td>0.05</td>\n",
1424
- " <td>0.95</td>\n",
1425
- " <td>5</td>\n",
1426
- " <td>standard</td>\n",
1427
- " <td>median</td>\n",
1428
- " <td>lasso</td>\n",
1429
- " <td>smote</td>\n",
1430
- " <td>yes_50_0.05_0.95_5_standard_median_lasso_smote</td>\n",
1431
- " </tr>\n",
1432
- " </tbody>\n",
1433
- "</table>\n",
1434
- "</div>"
1435
- ],
1436
- "text/plain": [
1437
- " Model Accuracy Precision Recall F1-score drop duplicates \\\n",
1438
- "0 random_forest 0.89 0.15 0.15 0.15 yes \n",
1439
- "\n",
1440
- " missing values th variance th correlation th outlier removal th \\\n",
1441
- "0 50 0.05 0.95 5 \n",
1442
- "\n",
1443
- " scaling method imputation method feature selection imbalance treatment \\\n",
1444
- "0 standard median lasso smote \n",
1445
- "\n",
1446
- " model_variables \n",
1447
- "0 yes_50_0.05_0.95_5_standard_median_lasso_smote "
1448
- ]
1449
- },
1450
- "metadata": {},
1451
- "output_type": "display_data"
1452
  }
1453
  ],
1454
  "source": [
@@ -1458,23 +1263,15 @@
1458
  {
1459
  "attachments": {},
1460
  "cell_type": "markdown",
1461
- "metadata": {
1462
- "slideshow": {
1463
- "slide_type": "skip"
1464
- }
1465
- },
1466
  "source": [
1467
  "#### **Confusion Matrix**"
1468
  ]
1469
  },
1470
  {
1471
  "cell_type": "code",
1472
- "execution_count": 125,
1473
- "metadata": {
1474
- "slideshow": {
1475
- "slide_type": "slide"
1476
- }
1477
- },
1478
  "outputs": [
1479
  {
1480
  "data": {
@@ -1548,6 +1345,90 @@
1548
  "\n",
1549
  "display(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])"
1550
  ]
1551
  }
1552
  ],
1553
  "metadata": {
 
26
  },
27
  {
28
  "cell_type": "code",
29
+ "execution_count": 357,
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
 
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 358,
57
  "metadata": {},
58
  "outputs": [
59
  {
60
  "data": {
61
  "application/mercury+json": {
62
  "allow_download": true,
63
+ "code_uid": "App.0.40.24.1-randd9fe9ae5",
64
  "continuous_update": false,
65
  "description": "Recumpute everything dynamically",
66
  "full_screen": true,
 
92
  },
93
  {
94
  "cell_type": "code",
95
+ "execution_count": 359,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
 
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 360,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "data": {
137
  "application/mercury+json": {
138
+ "code_uid": "Text.0.40.15.11-randec98731b",
139
  "disabled": false,
140
  "hidden": false,
141
  "label": "Test Size Ratio",
142
+ "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
143
  "rows": 1,
144
  "url_key": "",
145
  "value": "0.25",
146
  "widget": "Text"
147
  },
148
  "application/vnd.jupyter.widget-view+json": {
149
+ "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
150
  "version_major": 2,
151
  "version_minor": 0
152
  },
 
160
  {
161
  "data": {
162
  "application/mercury+json": {
163
+ "code_uid": "Text.0.40.15.14-randfa24ca10",
164
  "disabled": false,
165
  "hidden": false,
166
  "label": "Random State Integer",
167
+ "model_id": "cdaf85c404494bae95a32286425b9034",
168
  "rows": 1,
169
  "url_key": "",
170
  "value": "13",
171
  "widget": "Text"
172
  },
173
  "application/vnd.jupyter.widget-view+json": {
174
+ "model_id": "cdaf85c404494bae95a32286425b9034",
175
  "version_major": 2,
176
  "version_minor": 0
177
  },
 
220
  },
221
  {
222
  "cell_type": "code",
223
+ "execution_count": 361,
224
  "metadata": {},
225
  "outputs": [],
226
  "source": [
227
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
228
  " correlation_threshold=1.1):\n",
229
  " \n",
230
+ " global feature_removal_report0\n",
231
+ " global feature_removal_report1\n",
232
+ " global feature_removal_report2\n",
233
+ " global feature_removal_report3\n",
234
+ " global feature_removal_report4\n",
235
+ " global feature_removal_report5\n",
236
+ " global feature_removal_report6\n",
237
  " \n",
238
+ " \n",
239
+ " feature_removal_report0 = 'Shape of the dataframe is:' , df.shape\n",
240
  "\n",
241
  " # Drop duplicated columns\n",
242
  " if drop_duplicates == 'yes':\n",
243
  " new_column_names = df.columns\n",
244
  " df = df.T.drop_duplicates().T\n",
245
+ " feature_removal_report1 = 'the number of columns dropped due to duplications is: ', len(new_column_names) - len(df.columns)\n",
246
  " drop_duplicated = list(set(new_column_names) - set(df.columns))\n",
247
  "\n",
248
  " elif drop_duplicates == 'no':\n",
249
  " df = df.T.T\n",
250
+ " feature_removal_report1 = 'No columns were dropped due to duplications' \n",
251
  "\n",
252
  " # Print the percentage of columns in df with missing values more than or equal to threshold\n",
253
+ " feature_removal_report2 = 'the number of columns dropped due to missing values is: ', len(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
254
  " \n",
255
  " # Print into a list the columns to be dropped due to missing values\n",
256
  " drop_missing = list(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
 
259
  " df.drop(drop_missing, axis=1, inplace=True)\n",
260
  " \n",
261
  " # Print the number of columns in df with variance less than threshold\n",
262
+ " feature_removal_report3 = 'the number of columns dropped due to low variance is: ', len(df.var()[df.var() <= variance_threshold].index)\n",
263
  "\n",
264
  " # Print into a list the columns to be dropped due to low variance\n",
265
  " drop_variance = list(df.var()[df.var() <= variance_threshold].index)\n",
 
273
  " corr_matrix = df.corr().abs().round(4)\n",
274
  " upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n",
275
  " to_drop = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
276
+ " feature_removal_report4 = 'the number of columns dropped due to high correlation is: ', len(to_drop)\n",
277
  "\n",
278
  " # Print into a list the columns to be dropped due to high correlation\n",
279
  " drop_correlation = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
 
287
  " elif drop_duplicates =='no':\n",
288
  " dropped = (drop_missing+drop_variance+drop_correlation)\n",
289
  " \n",
290
+ " feature_removal_report5 = 'Total number of columns to be dropped is: ', len(dropped)\n",
291
+ " feature_removal_report6 = 'New shape of the dataframe is: ', df.shape\n",
292
  "\n",
293
  " global drop_duplicates_var\n",
294
  " drop_duplicates_var = drop_duplicates\n",
 
320
  },
321
  {
322
  "cell_type": "code",
323
+ "execution_count": 362,
324
  "metadata": {},
325
  "outputs": [],
326
  "source": [
327
  "def outlier_removal(z_df, z_threshold=4):\n",
328
  " \n",
329
  " global outlier_var\n",
330
+ " global outlier_removal_report0\n",
331
+ " global outlier_removal_report1\n",
332
  "\n",
333
  "\n",
334
  " if z_threshold == 'none':\n",
335
+ " outlier_removal_report0 = 'No outliers were removed'\n",
336
  " outlier_var = 'none'\n",
337
  " return z_df\n",
338
  " \n",
339
  " else:\n",
340
+ " outlier_removal_report0 = 'The z-score threshold is:', z_threshold\n",
341
  "\n",
342
  " z_df_copy = z_df.copy()\n",
343
  "\n",
 
348
  " z_df_copy[outliers_mask] = np.nan\n",
349
  "\n",
350
  " outliers_count = np.count_nonzero(outliers_mask)\n",
351
+ " outlier_removal_report1 = 'The number of outliers removed from the dataset is:', outliers_count\n",
352
  "\n",
353
  " outlier_var = z_threshold\n",
354
  "\n",
 
355
  " return z_df_copy"
356
  ]
357
  },
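outlier_removal masks any cell whose column-wise z-score magnitude exceeds the threshold and leaves the resulting NaNs for the later imputation step. A minimal standalone sketch of that masking, assuming all-numeric columns (pandas' std() uses ddof=1, so counts may differ slightly from other z-score implementations):

    import pandas as pd

    def zscore_mask(df: pd.DataFrame, z_threshold: float = 4) -> pd.DataFrame:
        # column-wise z-scores; mean() and std() skip existing NaNs
        z = (df - df.mean()) / df.std()
        # cells beyond the threshold become NaN and are imputed later
        return df.mask(z.abs() > z_threshold)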
 
369
  },
370
  {
371
  "cell_type": "code",
372
+ "execution_count": 363,
373
  "metadata": {},
374
  "outputs": [],
375
  "source": [
 
378
  "def scale_dataframe(scale_model,df_fit, df_transform):\n",
379
  " \n",
380
  " global scale_model_var\n",
381
+ " global scaling_report0\n",
382
  "\n",
383
  " if scale_model == 'robust':\n",
384
  " from sklearn.preprocessing import RobustScaler\n",
 
386
  " scaler.fit(df_fit)\n",
387
  " df_scaled = scaler.transform(df_transform)\n",
388
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
389
+ " scaling_report0 = 'The dataframe has been scaled using the robust scaling model'\n",
390
  " scale_model_var = 'robust'\n",
391
  " return df_scaled\n",
392
  " \n",
 
396
  " scaler.fit(df_fit)\n",
397
  " df_scaled = scaler.transform(df_transform)\n",
398
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
399
+ " scaling_report0 = 'The dataframe has been scaled using the standard scaling model'\n",
400
  " scale_model_var = 'standard'\n",
401
  " return df_scaled\n",
402
  " \n",
 
406
  " scaler.fit(df_fit)\n",
407
  " df_scaled = scaler.transform(df_transform)\n",
408
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
409
+ " scaling_report0 = 'The dataframe has been scaled using the normal scaling model'\n",
410
  " scale_model_var = 'normal'\n",
411
  " return df_scaled\n",
412
  " \n",
 
416
  " scaler.fit(df_fit)\n",
417
  " df_scaled = scaler.transform(df_transform)\n",
418
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
419
+ " scaling_report0 = 'The dataframe has been scaled using the minmax scaling model'\n",
420
  " scale_model_var = 'minmax'\n",
421
  " return df_scaled\n",
422
  " \n",
423
  " elif scale_model == 'none':\n",
424
+ " scaling_report0 = 'The dataframe has not been scaled'\n",
425
  " scale_model_var = 'none'\n",
426
  " return df_transform\n",
427
  " \n",
 
444
  },
445
  {
446
  "cell_type": "code",
447
+ "execution_count": 364,
448
  "metadata": {},
449
  "outputs": [],
450
  "source": [
 
452
  "\n",
453
  "def impute_missing_values(imputation, df_fit, df_transform, n_neighbors=5):\n",
454
  "\n",
455
  " global imputation_var\n",
456
+ " global imputation_report0\n",
457
+ " global imputation_report1\n",
458
+ " global imputation_report2\n",
459
+ " global imputation_report3\n",
460
+ "\n",
461
+ " imputation_report0 = 'Number of missing values before imputation: ', df_transform.isnull().sum().sum()\n",
462
+ "\n",
463
  "\n",
464
  " if imputation == 'knn':\n",
465
  "\n",
 
468
  " imputer.fit(df_fit)\n",
469
  " df_imputed = imputer.transform(df_transform)\n",
470
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
471
+ " imputation_report1 = 'knn imputation has been applied' \n",
472
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
473
  " imputation_var = 'knn'\n",
474
  " return df_imputed\n",
475
  " \n",
 
480
  " imputer.fit(df_fit)\n",
481
  " df_imputed = imputer.transform(df_transform)\n",
482
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
483
+ " imputation_report1 = 'mean imputation has been applied'\n",
484
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
485
  " imputation_var = 'mean'\n",
486
  " return df_imputed\n",
487
  " \n",
 
492
  " imputer.fit(df_fit)\n",
493
  " df_imputed = imputer.transform(df_transform)\n",
494
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
495
+ " imputation_report1 = 'median imputation has been applied'\n",
496
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
497
  " imputation_var = 'median'\n",
498
  " return df_imputed\n",
499
  " \n",
 
504
  " imputer.fit(df_fit)\n",
505
  " df_imputed = imputer.transform(df_transform)\n",
506
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
507
+ " imputation_report1 = 'most frequent imputation has been applied'\n",
508
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
509
  " imputation_var = 'most_frequent'\n",
510
  " return df_imputed\n",
511
  " \n",
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": 365,
533
  "metadata": {},
534
  "outputs": [],
535
  "source": [
 
537
  "\n",
538
  " global feature_selection_var\n",
539
  " global selected_features\n",
540
+ " \n",
541
+ " global feature_selection_report0\n",
542
+ " global feature_selection_report1\n",
543
  "\n",
544
  "\n",
545
  " # if method is boruta, run boruta feature selection and return the selected features and the training set with only the selected features\n",
546
  "\n",
547
  " if method == 'boruta':\n",
548
+ " feature_selection_report0 = 'Selected method is: ', method\n",
549
  " from boruta import BorutaPy\n",
550
  " from sklearn.ensemble import RandomForestClassifier\n",
551
  " rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
 
554
  " selected_feature_indices = boruta_selector.support_\n",
555
  " selected_columns = X_train.columns[selected_feature_indices]\n",
556
  " X_train_filtered = X_train.iloc[:, selected_feature_indices]\n",
557
+ " feature_selection_report1 = 'Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape\n",
558
  " return X_train_filtered, selected_columns\n",
559
  " \n",
560
  " if method == 'none':\n",
561
+ " feature_selection_report = 'No feature selection has been applied'\n",
562
  " X_train_filtered = X_train\n",
563
+ " feature_selection_report = 'Shape of the training set after no feature selection: ', X_train_filtered.shape\n",
564
  " feature_selection_var = 'none'\n",
565
  " selected_features = X_train_filtered.columns\n",
566
  " return X_train_filtered, selected_features \n",
567
  " \n",
568
  " if method == 'lasso':\n",
569
+ " feature_selection_report0 = 'Selected method is: ', method\n",
570
  " from sklearn.linear_model import LassoCV\n",
571
  " from sklearn.feature_selection import SelectFromModel\n",
572
  " lasso = LassoCV().fit(X_train, y_train)\n",
573
  " model = SelectFromModel(lasso, prefit=True)\n",
574
  " X_train_filtered = model.transform(X_train)\n",
575
  " selected_features = X_train.columns[model.get_support()]\n",
576
+ " feature_selection_report1 = 'Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape\n",
577
  " feature_selection_var = 'lasso'\n",
578
  " return X_train_filtered, selected_features\n",
579
  " \n",
580
  " if method == 'pca':\n",
581
+ " feature_selection_report0 = 'Selected method is: ', method\n",
582
  " from sklearn.decomposition import PCA\n",
583
  " pca = PCA(n_components=15)\n",
584
  " X_train_pca = pca.fit_transform(X_train)\n",
585
  " selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n",
586
+ " feature_selection_report1 = 'Shape of the training set after feature selection with PCA: ', X_train_pca.shape\n",
587
  " feature_selection_var = 'pca'\n",
588
  " return X_train_pca, selected_features\n",
589
  " \n",
590
  " if method == 'rfe':\n",
591
+ " feature_selection_report0 = 'Selected method is: ', method\n",
592
  " from sklearn.feature_selection import RFE\n",
593
  " from sklearn.ensemble import RandomForestClassifier\n",
594
  " rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n",
595
  " rfe_selector.fit(X_train, y_train)\n",
596
  " selected_features = X_train.columns[rfe_selector.support_]\n",
597
  " X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n",
598
+ " feature_selection_report1 = 'Shape of the training set after feature selection with RFE: ', X_train_filtered.shape\n",
599
  " feature_selection_var = 'rfe'\n",
600
  " return X_train_filtered, selected_features\n",
601
  " "
 
615
  },
616
  {
617
  "cell_type": "code",
618
+ "execution_count": 366,
619
  "metadata": {},
620
  "outputs": [],
621
  "source": [
 
624
  "def imbalance_treatment(method, X_train, y_train):\n",
625
  "\n",
626
  " global imbalance_var\n",
627
+ " global imbalance_report0\n",
628
+ " global imbalance_report1\n",
 
629
  "\n",
630
  " if method == 'smote': \n",
631
  " from imblearn.over_sampling import SMOTE\n",
632
  " sm = SMOTE(random_state=42)\n",
633
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
634
+ " imbalance_report0 = 'Shape of the training set after oversampling with SMOTE: ', X_train_res.shape\n",
635
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with SMOTE: \\n', y_train_res.value_counts()\n",
636
  " imbalance_var = 'smote'\n",
637
  " return X_train_res, y_train_res\n",
638
  " \n",
 
640
  " from imblearn.under_sampling import RandomUnderSampler\n",
641
  " rus = RandomUnderSampler(random_state=42)\n",
642
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
643
+ " imbalance_report0 = 'Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape\n",
644
+ " imbalance_report1 = 'Value counts of the target variable after undersampling with RandomUnderSampler: \\n', y_train_res.value_counts()\n",
645
  " imbalance_var = 'undersampling'\n",
646
  " return X_train_res, y_train_res\n",
647
  " \n",
 
649
  " from imblearn.over_sampling import RandomOverSampler\n",
650
  " ros = RandomOverSampler(random_state=42)\n",
651
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
652
+ " imbalance_report0 = 'Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape\n",
653
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with RandomOverSampler: \\n', y_train_res.value_counts()\n",
654
  " imbalance_var = 'rose'\n",
655
  " return X_train_res, y_train_res\n",
656
  " \n",
 
658
  " if method == 'none':\n",
659
  " X_train_res = X_train\n",
660
  " y_train_res = y_train\n",
661
+ " imbalance_report0 = 'Shape of the training set after no resampling: ', X_train_res.shape\n",
662
+ " imbalance_report1 = 'Value counts of the target variable after no resampling: \\n', y_train_res.value_counts()\n",
663
  " imbalance_var = 'none'\n",
664
  " return X_train_res, y_train_res\n",
665
  " \n",
 
684
  },
685
  {
686
  "cell_type": "code",
687
+ "execution_count": 367,
688
  "metadata": {},
689
  "outputs": [],
690
  "source": [
 
757
  },
758
  {
759
  "cell_type": "code",
760
+ "execution_count": 368,
761
  "metadata": {},
762
  "outputs": [],
763
  "source": [
 
779
  },
780
  {
781
  "cell_type": "code",
782
+ "execution_count": 369,
783
  "metadata": {},
784
  "outputs": [],
785
  "source": [
 
882
  },
883
  {
884
  "cell_type": "code",
885
+ "execution_count": 370,
886
  "metadata": {},
887
  "outputs": [
888
  {
889
  "data": {
890
  "application/mercury+json": {
891
+ "code_uid": "Text.0.40.15.8-rand4a43baec",
892
  "disabled": false,
893
  "hidden": false,
894
  "label": "Missing Value Threeshold",
895
+ "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
896
  "rows": 1,
897
  "url_key": "",
898
  "value": "50",
899
  "widget": "Text"
900
  },
901
  "application/vnd.jupyter.widget-view+json": {
902
+ "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
903
  "version_major": 2,
904
  "version_minor": 0
905
  },
 
913
  {
914
  "data": {
915
  "application/mercury+json": {
916
+ "code_uid": "Text.0.40.15.11-rand6f838484",
917
  "disabled": false,
918
  "hidden": false,
919
  "label": "Variance Threshold",
920
+ "model_id": "97419c4a49954b8490aa311870d010b9",
921
  "rows": 1,
922
  "url_key": "",
923
  "value": "0.05",
924
  "widget": "Text"
925
  },
926
  "application/vnd.jupyter.widget-view+json": {
927
+ "model_id": "97419c4a49954b8490aa311870d010b9",
928
  "version_major": 2,
929
  "version_minor": 0
930
  },
 
938
  {
939
  "data": {
940
  "application/mercury+json": {
941
+ "code_uid": "Text.0.40.15.14-rand6243cbfa",
942
  "disabled": false,
943
  "hidden": false,
944
  "label": "Correlation Threshold",
945
+ "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
946
  "rows": 1,
947
  "url_key": "",
948
  "value": "0.95",
949
  "widget": "Text"
950
  },
951
  "application/vnd.jupyter.widget-view+json": {
952
+ "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
953
  "version_major": 2,
954
  "version_minor": 0
955
  },
 
969
  4,
970
  5
971
  ],
972
+ "code_uid": "Select.0.40.16.18-randa184b437",
973
  "disabled": false,
974
  "hidden": false,
975
  "label": "Outlier Removal Threshold",
976
+ "model_id": "0be493385a154210b3c7685a3bd1074f",
977
  "url_key": "",
978
  "value": 5,
979
  "widget": "Select"
980
  },
981
  "application/vnd.jupyter.widget-view+json": {
982
+ "model_id": "0be493385a154210b3c7685a3bd1074f",
983
  "version_major": 2,
984
  "version_minor": 0
985
  },
 
999
  "minmax",
1000
  "robust"
1001
  ],
1002
+ "code_uid": "Select.0.40.16.25-rand163d8992",
1003
  "disabled": false,
1004
  "hidden": false,
1005
  "label": "Scaling Variables",
1006
+ "model_id": "985eab871677416f9c14ea528b0fd561",
1007
  "url_key": "",
1008
  "value": "standard",
1009
  "widget": "Select"
1010
  },
1011
  "application/vnd.jupyter.widget-view+json": {
1012
+ "model_id": "985eab871677416f9c14ea528b0fd561",
1013
  "version_major": 2,
1014
  "version_minor": 0
1015
  },
 
1029
  "knn",
1030
  "most_frequent"
1031
  ],
1032
+ "code_uid": "Select.0.40.16.29-randb76d7c1d",
1033
  "disabled": false,
1034
  "hidden": false,
1035
  "label": "Imputation Methods",
1036
+ "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1037
  "url_key": "",
1038
  "value": "median",
1039
  "widget": "Select"
1040
  },
1041
  "application/vnd.jupyter.widget-view+json": {
1042
+ "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1043
  "version_major": 2,
1044
  "version_minor": 0
1045
  },
 
1060
  "pca",
1061
  "boruta"
1062
  ],
1063
+ "code_uid": "Select.0.40.16.34-rand254bd909",
1064
  "disabled": false,
1065
  "hidden": false,
1066
  "label": "Feature Selection",
1067
+ "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1068
  "url_key": "",
1069
  "value": "lasso",
1070
  "widget": "Select"
1071
  },
1072
  "application/vnd.jupyter.widget-view+json": {
1073
+ "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1074
  "version_major": 2,
1075
  "version_minor": 0
1076
  },
 
1090
  "undersampling",
1091
  "rose"
1092
  ],
1093
+ "code_uid": "Select.0.40.16.38-rand75e4d938",
1094
  "disabled": false,
1095
  "hidden": false,
1096
  "label": "Imbalance Treatment",
1097
+ "model_id": "965a81a69265473a830f8eec5e8ba2df",
1098
  "url_key": "",
1099
  "value": "smote",
1100
  "widget": "Select"
1101
  },
1102
  "application/vnd.jupyter.widget-view+json": {
1103
+ "model_id": "965a81a69265473a830f8eec5e8ba2df",
1104
  "version_major": 2,
1105
  "version_minor": 0
1106
  },
 
1123
  "decision_tree",
1124
  "xgboost"
1125
  ],
1126
+ "code_uid": "Select.0.40.16.42-rand1bbd78ac",
1127
  "disabled": false,
1128
  "hidden": false,
1129
  "label": "Model Selection",
1130
+ "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1131
  "url_key": "",
1132
  "value": "random_forest",
1133
  "widget": "Select"
1134
  },
1135
  "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1137
  "version_major": 2,
1138
  "version_minor": 0
1139
  },
 
1148
  "name": "stdout",
1149
  "output_type": "stream",
1150
  "text": [
1151
+ "<class 'list'>\n"
1152
  ]
1153
  }
1154
  ],
 
1245
  },
1246
  {
1247
  "cell_type": "code",
1248
+ "execution_count": 371,
1249
  "metadata": {},
1250
  "outputs": [
1251
  {
 
1254
  "text": [
1255
  "--------------------------------------------------\n"
1256
  ]
1257
  }
1258
  ],
1259
  "source": [
 
1263
  {
1264
  "attachments": {},
1265
  "cell_type": "markdown",
1266
+ "metadata": {},
1267
  "source": [
1268
  "#### **Confusion Matrix**"
1269
  ]
1270
  },
1271
  {
1272
  "cell_type": "code",
1273
+ "execution_count": 372,
1274
+ "metadata": {},
1275
  "outputs": [
1276
  {
1277
  "data": {
 
1345
  "\n",
1346
  "display(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])"
1347
  ]
1348
+ },
1349
+ {
1350
+ "attachments": {},
1351
+ "cell_type": "markdown",
1352
+ "metadata": {},
1353
+ "source": [
1354
+ "### **Transformations Report**"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "code",
1359
+ "execution_count": 373,
1360
+ "metadata": {},
1361
+ "outputs": [
1362
+ {
1363
+ "name": "stdout",
1364
+ "output_type": "stream",
1365
+ "text": [
1366
+ "------------------------------------------\n",
1367
+ "FEATURE REMOVAL\n",
1368
+ "('Shape of the dataframe is:', (1175, 590))\n",
1369
+ "('the number of columns dropped due to duplications is: ', 104)\n",
1370
+ "('the number of columns dropped due to missing values is: ', 28)\n",
1371
+ "('the number of columns dropped due to low variance is: ', 189)\n",
1372
+ "('the number of columns dropped due to high correlation is: ', 90)\n",
1373
+ "('Total number of columns to be dropped is: ', 411)\n",
1374
+ "('New shape of the dataframe is: ', (1175, 179))\n",
1375
+ "------------------------------------------\n",
1376
+ "OUTLIER REMOVAL\n",
1377
+ "('The z-score threshold is:', 5)\n",
1378
+ "('The number of outliers removed from the dataset is:', 163)\n",
1379
+ "------------------------------------------\n",
1380
+ "SCALING\n",
1381
+ "The dataframe has been scaled using the standard scaling model\n",
1382
+ "------------------------------------------\n",
1383
+ "IMPUTATION\n",
1384
+ "('Number of missing values before imputation: ', 1196)\n",
1385
+ "median imputation has been applied\n",
1386
+ "('Number of missing values after imputation: ', 0)\n",
1387
+ "------------------------------------------\n",
1388
+ "FEATURE SELECTION\n",
1389
+ "('Selected method is: ', 'lasso')\n",
1390
+ "('Shape of the training set after feature selection with LassoCV: ', (1175, 14))\n",
1391
+ "------------------------------------------\n",
1392
+ "IMBALANCE TREATMENT\n",
1393
+ "('Shape of the training set after oversampling with SMOTE: ', (2194, 14))\n",
1394
+ "('Value counts of the target variable after oversampling with SMOTE: \\n', pass/fail\n",
1395
+ "0 1097\n",
1396
+ "1 1097\n",
1397
+ "dtype: int64)\n"
1398
+ ]
1399
+ }
1400
+ ],
1401
+ "source": [
1402
+ "print('------------------------------------------')\n",
1403
+ "print('FEATURE REMOVAL')\n",
1404
+ "print(feature_removal_report0)\n",
1405
+ "print(feature_removal_report1)\n",
1406
+ "print(feature_removal_report2)\n",
1407
+ "print(feature_removal_report3)\n",
1408
+ "print(feature_removal_report4)\n",
1409
+ "print(feature_removal_report5)\n",
1410
+ "print(feature_removal_report6)\n",
1411
+ "print('------------------------------------------')\n",
1412
+ "print('OUTLIER REMOVAL')\n",
1413
+ "print(outlier_removal_report0)\n",
1414
+ "print(outlier_removal_report1)\n",
1415
+ "print('------------------------------------------')\n",
1416
+ "print('SCALING')\n",
1417
+ "print(scaling_report0)\n",
1418
+ "print('------------------------------------------')\n",
1419
+ "print('IMPUTATION')\n",
1420
+ "print(imputation_report0)\n",
1421
+ "print(imputation_report1)\n",
1422
+ "print(imputation_report2)\n",
1423
+ "print('------------------------------------------')\n",
1424
+ "print('FEATURE SELECTION')\n",
1425
+ "print(feature_selection_report0)\n",
1426
+ "print(feature_selection_report1)\n",
1427
+ "print('------------------------------------------')\n",
1428
+ "print('IMBALANCE TREATMENT')\n",
1429
+ "print(imbalance_report0)\n",
1430
+ "print(imbalance_report1)"
1431
+ ]
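The new report cell depends on a dozen numbered globals staying in sync with the pipeline functions. Purely as a design option, not what this commit implements, the same output could come from one shared structure:

    # hypothetical: a single report dict keyed by section instead of numbered globals
    report = {}

    def log(section, message):
        report.setdefault(section, []).append(message)

    # e.g. inside outlier_removal: log('OUTLIER REMOVAL', f'The z-score threshold is: {z_threshold}')

    for section, lines in report.items():
        print('------------------------------------------')
        print(section)
        for line in lines:
            print(line)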
1432
  }
1433
  ],
1434
  "metadata": {