erjonb committed on
Commit 51f4ca9 · 1 Parent(s): e1dc96a

Upload P2 - Secom Notebook2 - Mercury.ipynb

Files changed (1)
  1. P2 - Secom Notebook2 - Mercury.ipynb +202 -321
P2 - Secom Notebook2 - Mercury.ipynb CHANGED
@@ -26,7 +26,7 @@
26
  },
27
  {
28
  "cell_type": "code",
29
- "execution_count": 85,
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
@@ -53,14 +53,14 @@
53
  },
54
  {
55
  "cell_type": "code",
56
- "execution_count": 86,
57
  "metadata": {},
58
  "outputs": [
59
  {
60
  "data": {
61
  "application/mercury+json": {
62
  "allow_download": true,
63
- "code_uid": "App.0.40.24.1-rand8c10e2d9",
64
  "continuous_update": false,
65
  "description": "Recumpute everything dynamically",
66
  "full_screen": true,
@@ -92,7 +92,7 @@
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": 87,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
@@ -129,24 +129,24 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 88,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "data": {
137
  "application/mercury+json": {
138
- "code_uid": "Text.0.40.15.11-rand39f89858",
139
  "disabled": false,
140
  "hidden": false,
141
  "label": "Test Size Ratio",
142
- "model_id": "271115d337014695a05d7e83307b4cc4",
143
  "rows": 1,
144
  "url_key": "",
145
  "value": "0.25",
146
  "widget": "Text"
147
  },
148
  "application/vnd.jupyter.widget-view+json": {
149
- "model_id": "271115d337014695a05d7e83307b4cc4",
150
  "version_major": 2,
151
  "version_minor": 0
152
  },
@@ -160,18 +160,18 @@
160
  {
161
  "data": {
162
  "application/mercury+json": {
163
- "code_uid": "Text.0.40.15.14-randf159337c",
164
  "disabled": false,
165
  "hidden": false,
166
  "label": "Random State Integer",
167
- "model_id": "87a237754fa24e11a17700de955552a8",
168
  "rows": 1,
169
  "url_key": "",
170
  "value": "13",
171
  "widget": "Text"
172
  },
173
  "application/vnd.jupyter.widget-view+json": {
174
- "model_id": "87a237754fa24e11a17700de955552a8",
175
  "version_major": 2,
176
  "version_minor": 0
177
  },
@@ -220,31 +220,37 @@
220
  },
221
  {
222
  "cell_type": "code",
223
- "execution_count": 89,
224
  "metadata": {},
225
  "outputs": [],
226
  "source": [
227
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
228
  " correlation_threshold=1.1):\n",
229
  " \n",
230
- " print('------------------------------------------')\n",
231
- " print('FEATURE REMOVAL')\n",
232
  " \n",
233
- " print('Shape of the dataframe is: ', df.shape)\n",
 
234
  "\n",
235
  " # Drop duplicated columns\n",
236
  " if drop_duplicates == 'yes':\n",
237
  " new_column_names = df.columns\n",
238
  " df = df.T.drop_duplicates().T\n",
239
- " print('the number of columns dropped due to duplications is: ', len(new_column_names) - len(df.columns))\n",
240
  " drop_duplicated = list(set(new_column_names) - set(df.columns))\n",
241
  "\n",
242
  " elif drop_duplicates == 'no':\n",
243
  " df = df.T.T\n",
244
- " print('No columns were dropped due to duplications') \n",
245
  "\n",
246
  " # Print the percentage of columns in df with missing values more than or equal to threshold\n",
247
- " print('the number of columns dropped due to missing values is: ', len(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index))\n",
248
  " \n",
249
  " # Print into a list the columns to be dropped due to missing values\n",
250
  " drop_missing = list(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
@@ -253,7 +259,7 @@
253
  " df.drop(drop_missing, axis=1, inplace=True)\n",
254
  " \n",
255
  " # Print the number of columns in df with variance less than threshold\n",
256
- " print('the number of columns dropped due to low variance is: ', len(df.var()[df.var() <= variance_threshold].index))\n",
257
  "\n",
258
  " # Print into a list the columns to be dropped due to low variance\n",
259
  " drop_variance = list(df.var()[df.var() <= variance_threshold].index)\n",
@@ -267,7 +273,7 @@
267
  " corr_matrix = df.corr().abs().round(4)\n",
268
  " upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n",
269
  " to_drop = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
270
- " print('the number of columns dropped due to high correlation is: ', len(to_drop))\n",
271
  "\n",
272
  " # Print into a list the columns to be dropped due to high correlation\n",
273
  " drop_correlation = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
@@ -281,8 +287,8 @@
281
  " elif drop_duplicates =='no':\n",
282
  " dropped = (drop_missing+drop_variance+drop_correlation)\n",
283
  " \n",
284
- " print('Total number of columns to be dropped is: ', len(dropped))\n",
285
- " print('New shape of the dataframe is: ', df.shape)\n",
286
  "\n",
287
  " global drop_duplicates_var\n",
288
  " drop_duplicates_var = drop_duplicates\n",
@@ -314,24 +320,24 @@
314
  },
315
  {
316
  "cell_type": "code",
317
- "execution_count": 90,
318
  "metadata": {},
319
  "outputs": [],
320
  "source": [
321
  "def outlier_removal(z_df, z_threshold=4):\n",
322
  " \n",
323
  " global outlier_var\n",
324
  "\n",
325
- " print('------------------------------------------')\n",
326
- " print('OUTLIER REMOVAL')\n",
327
  "\n",
328
  " if z_threshold == 'none':\n",
329
- " print('No outliers were removed')\n",
330
  " outlier_var = 'none'\n",
331
  " return z_df\n",
332
  " \n",
333
  " else:\n",
334
- " print('The z-score threshold is:', z_threshold)\n",
335
  "\n",
336
  " z_df_copy = z_df.copy()\n",
337
  "\n",
@@ -342,11 +348,10 @@
342
  " z_df_copy[outliers_mask] = np.nan\n",
343
  "\n",
344
  " outliers_count = np.count_nonzero(outliers_mask)\n",
345
- " print('The number of outliers removed from the dataset is:', outliers_count)\n",
346
  "\n",
347
  " outlier_var = z_threshold\n",
348
  "\n",
349
- " print(type(z_df_copy))\n",
350
  " return z_df_copy"
351
  ]
352
  },
@@ -364,7 +369,7 @@
364
  },
365
  {
366
  "cell_type": "code",
367
- "execution_count": 91,
368
  "metadata": {},
369
  "outputs": [],
370
  "source": [
@@ -373,9 +378,7 @@
373
  "def scale_dataframe(scale_model,df_fit, df_transform):\n",
374
  " \n",
375
  " global scale_model_var\n",
376
- "\n",
377
- " print('------------------------------------------')\n",
378
- " print('SCALING THE DATAFRAME')\n",
379
  "\n",
380
  " if scale_model == 'robust':\n",
381
  " from sklearn.preprocessing import RobustScaler\n",
@@ -383,7 +386,7 @@
383
  " scaler.fit(df_fit)\n",
384
  " df_scaled = scaler.transform(df_transform)\n",
385
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
386
- " print('The dataframe has been scaled using the robust scaling model')\n",
387
  " scale_model_var = 'robust'\n",
388
  " return df_scaled\n",
389
  " \n",
@@ -393,7 +396,7 @@
393
  " scaler.fit(df_fit)\n",
394
  " df_scaled = scaler.transform(df_transform)\n",
395
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
396
- " print('The dataframe has been scaled using the standard scaling model')\n",
397
  " scale_model_var = 'standard'\n",
398
  " return df_scaled\n",
399
  " \n",
@@ -403,7 +406,7 @@
403
  " scaler.fit(df_fit)\n",
404
  " df_scaled = scaler.transform(df_transform)\n",
405
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
406
- " print('The dataframe has been scaled using the normal scaling model')\n",
407
  " scale_model_var = 'normal'\n",
408
  " return df_scaled\n",
409
  " \n",
@@ -413,12 +416,12 @@
413
  " scaler.fit(df_fit)\n",
414
  " df_scaled = scaler.transform(df_transform)\n",
415
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
416
- " print('The dataframe has been scaled using the minmax scaling model')\n",
417
  " scale_model_var = 'minmax'\n",
418
  " return df_scaled\n",
419
  " \n",
420
  " elif scale_model == 'none':\n",
421
- " print('The dataframe has not been scaled')\n",
422
  " scale_model_var = 'none'\n",
423
  " return df_transform\n",
424
  " \n",
@@ -441,7 +444,7 @@
441
  },
442
  {
443
  "cell_type": "code",
444
- "execution_count": 92,
445
  "metadata": {},
446
  "outputs": [],
447
  "source": [
@@ -449,11 +452,14 @@
449
  "\n",
450
  "def impute_missing_values(imputation, df_fit, df_transform, n_neighbors=5):\n",
451
  "\n",
452
- " print('------------------------------------------')\n",
453
- " print('IMPUTATION PROCESS')\n",
454
- " print('Number of missing values before imputation: ', df_transform.isnull().sum().sum())\n",
455
- "\n",
456
  " global imputation_var\n",
457
  "\n",
458
  " if imputation == 'knn':\n",
459
  "\n",
@@ -462,8 +468,8 @@
462
  " imputer.fit(df_fit)\n",
463
  " df_imputed = imputer.transform(df_transform)\n",
464
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
465
- " print('knn imputation has been applied') \n",
466
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
467
  " imputation_var = 'knn'\n",
468
  " return df_imputed\n",
469
  " \n",
@@ -474,8 +480,8 @@
474
  " imputer.fit(df_fit)\n",
475
  " df_imputed = imputer.transform(df_transform)\n",
476
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
477
- " print('mean imputation has been applied')\n",
478
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
479
  " imputation_var = 'mean'\n",
480
  " return df_imputed\n",
481
  " \n",
@@ -486,8 +492,8 @@
486
  " imputer.fit(df_fit)\n",
487
  " df_imputed = imputer.transform(df_transform)\n",
488
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
489
- " print('median imputation has been applied')\n",
490
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
491
  " imputation_var = 'median'\n",
492
  " return df_imputed\n",
493
  " \n",
@@ -498,8 +504,8 @@
498
  " imputer.fit(df_fit)\n",
499
  " df_imputed = imputer.transform(df_transform)\n",
500
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
501
- " print('most frequent imputation has been applied')\n",
502
- " print('Number of missing values after imputation: ', df_imputed.isnull().sum().sum())\n",
503
  " imputation_var = 'most_frequent'\n",
504
  " return df_imputed\n",
505
  " \n",
@@ -523,7 +529,7 @@
523
  },
524
  {
525
  "cell_type": "code",
526
- "execution_count": 93,
527
  "metadata": {},
528
  "outputs": [],
529
  "source": [
@@ -531,14 +537,15 @@
531
  "\n",
532
  " global feature_selection_var\n",
533
  " global selected_features\n",
534
  "\n",
535
- " print('------------------------------------------')\n",
536
- " print('FEATURE SELECTION')\n",
537
  "\n",
538
  " # if method is boruta, run boruta feature selection and return the selected features and the training set with only the selected features\n",
539
  "\n",
540
  " if method == 'boruta':\n",
541
- " print('Selected method is: ', method)\n",
542
  " from boruta import BorutaPy\n",
543
  " from sklearn.ensemble import RandomForestClassifier\n",
544
  " rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
@@ -547,48 +554,48 @@
547
  " selected_feature_indices = boruta_selector.support_\n",
548
  " selected_columns = X_train.columns[selected_feature_indices]\n",
549
  " X_train_filtered = X_train.iloc[:, selected_feature_indices]\n",
550
- " print('Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape)\n",
551
  " return X_train_filtered, selected_columns\n",
552
  " \n",
553
  " if method == 'none':\n",
554
- " print('Selected method is: ', method)\n",
555
  " X_train_filtered = X_train\n",
556
- " print('Shape of the training set after no feature selection: ', X_train_filtered.shape)\n",
557
  " feature_selection_var = 'none'\n",
558
  " selected_features = X_train_filtered.columns\n",
559
  " return X_train_filtered, selected_features \n",
560
  " \n",
561
  " if method == 'lasso':\n",
562
- " print('Selected method is: ', method)\n",
563
  " from sklearn.linear_model import LassoCV\n",
564
  " from sklearn.feature_selection import SelectFromModel\n",
565
  " lasso = LassoCV().fit(X_train, y_train)\n",
566
  " model = SelectFromModel(lasso, prefit=True)\n",
567
  " X_train_filtered = model.transform(X_train)\n",
568
  " selected_features = X_train.columns[model.get_support()]\n",
569
- " print('Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape)\n",
570
  " feature_selection_var = 'lasso'\n",
571
  " return X_train_filtered, selected_features\n",
572
  " \n",
573
  " if method == 'pca':\n",
574
- " print('Selected method is: ', method)\n",
575
  " from sklearn.decomposition import PCA\n",
576
  " pca = PCA(n_components=15)\n",
577
  " X_train_pca = pca.fit_transform(X_train)\n",
578
  " selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n",
579
- " print('Shape of the training set after feature selection with PCA: ', X_train_pca.shape)\n",
580
  " feature_selection_var = 'pca'\n",
581
  " return X_train_pca, selected_features\n",
582
  " \n",
583
  " if method == 'rfe':\n",
584
- " print('Selected method is: ', method)\n",
585
  " from sklearn.feature_selection import RFE\n",
586
  " from sklearn.ensemble import RandomForestClassifier\n",
587
  " rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n",
588
  " rfe_selector.fit(X_train, y_train)\n",
589
  " selected_features = X_train.columns[rfe_selector.support_]\n",
590
  " X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n",
591
- " print('Shape of the training set after feature selection with RFE: ', X_train_filtered.shape)\n",
592
  " feature_selection_var = 'rfe'\n",
593
  " return X_train_filtered, selected_features\n",
594
  " "
@@ -608,7 +615,7 @@
608
  },
609
  {
610
  "cell_type": "code",
611
- "execution_count": 94,
612
  "metadata": {},
613
  "outputs": [],
614
  "source": [
@@ -617,16 +624,15 @@
617
  "def imbalance_treatment(method, X_train, y_train):\n",
618
  "\n",
619
  " global imbalance_var\n",
620
- "\n",
621
- " print('------------------------------------------')\n",
622
- " print('IMBALANCE TREATMENT')\n",
623
  "\n",
624
  " if method == 'smote': \n",
625
  " from imblearn.over_sampling import SMOTE\n",
626
  " sm = SMOTE(random_state=42)\n",
627
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
628
- " print('Shape of the training set after oversampling with SMOTE: ', X_train_res.shape)\n",
629
- " print('Value counts of the target variable after oversampling with SMOTE: \\n', y_train_res.value_counts())\n",
630
  " imbalance_var = 'smote'\n",
631
  " return X_train_res, y_train_res\n",
632
  " \n",
@@ -634,8 +640,8 @@
634
  " from imblearn.under_sampling import RandomUnderSampler\n",
635
  " rus = RandomUnderSampler(random_state=42)\n",
636
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
637
- " print('Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape)\n",
638
- " print('Value counts of the target variable after undersampling with RandomUnderSampler: \\n', y_train_res.value_counts())\n",
639
  " imbalance_var = 'undersampling'\n",
640
  " return X_train_res, y_train_res\n",
641
  " \n",
@@ -643,8 +649,8 @@
643
  " from imblearn.over_sampling import RandomOverSampler\n",
644
  " ros = RandomOverSampler(random_state=42)\n",
645
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
646
- " print('Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape)\n",
647
- " print('Value counts of the target variable after oversampling with RandomOverSampler: \\n', y_train_res.value_counts())\n",
648
  " imbalance_var = 'rose'\n",
649
  " return X_train_res, y_train_res\n",
650
  " \n",
@@ -652,8 +658,8 @@
652
  " if method == 'none':\n",
653
  " X_train_res = X_train\n",
654
  " y_train_res = y_train\n",
655
- " print('Shape of the training set after no resampling: ', X_train_res.shape)\n",
656
- " print('Value counts of the target variable after no resampling: \\n', y_train_res.value_counts())\n",
657
  " imbalance_var = 'none'\n",
658
  " return X_train_res, y_train_res\n",
659
  " \n",
@@ -678,7 +684,7 @@
678
  },
679
  {
680
  "cell_type": "code",
681
- "execution_count": 95,
682
  "metadata": {},
683
  "outputs": [],
684
  "source": [
@@ -751,7 +757,7 @@
751
  },
752
  {
753
  "cell_type": "code",
754
- "execution_count": 96,
755
  "metadata": {},
756
  "outputs": [],
757
  "source": [
@@ -773,7 +779,7 @@
773
  },
774
  {
775
  "cell_type": "code",
776
- "execution_count": 101,
777
  "metadata": {},
778
  "outputs": [],
779
  "source": [
@@ -876,24 +882,24 @@
876
  },
877
  {
878
  "cell_type": "code",
879
- "execution_count": 103,
880
  "metadata": {},
881
  "outputs": [
882
  {
883
  "data": {
884
  "application/mercury+json": {
885
- "code_uid": "Text.0.40.15.8-rand3ea159f1",
886
  "disabled": false,
887
  "hidden": false,
888
  "label": "Missing Value Threeshold",
889
- "model_id": "1f155017a0e64a71ba2f1a737d93c61d",
890
  "rows": 1,
891
  "url_key": "",
892
  "value": "50",
893
  "widget": "Text"
894
  },
895
  "application/vnd.jupyter.widget-view+json": {
896
- "model_id": "1f155017a0e64a71ba2f1a737d93c61d",
897
  "version_major": 2,
898
  "version_minor": 0
899
  },
@@ -907,18 +913,18 @@
907
  {
908
  "data": {
909
  "application/mercury+json": {
910
- "code_uid": "Text.0.40.15.11-rand7772c265",
911
  "disabled": false,
912
  "hidden": false,
913
  "label": "Variance Threshold",
914
- "model_id": "6ae03558245f4658abc409261e88e273",
915
  "rows": 1,
916
  "url_key": "",
917
  "value": "0.05",
918
  "widget": "Text"
919
  },
920
  "application/vnd.jupyter.widget-view+json": {
921
- "model_id": "6ae03558245f4658abc409261e88e273",
922
  "version_major": 2,
923
  "version_minor": 0
924
  },
@@ -932,18 +938,18 @@
932
  {
933
  "data": {
934
  "application/mercury+json": {
935
- "code_uid": "Text.0.40.15.14-rand52a68a07",
936
  "disabled": false,
937
  "hidden": false,
938
  "label": "Correlation Threshold",
939
- "model_id": "325fdc5cafb046c89be6440a2a0e855c",
940
  "rows": 1,
941
  "url_key": "",
942
  "value": "0.95",
943
  "widget": "Text"
944
  },
945
  "application/vnd.jupyter.widget-view+json": {
946
- "model_id": "325fdc5cafb046c89be6440a2a0e855c",
947
  "version_major": 2,
948
  "version_minor": 0
949
  },
@@ -963,17 +969,17 @@
963
  4,
964
  5
965
  ],
966
- "code_uid": "Select.0.40.16.18-randd219fc2a",
967
  "disabled": false,
968
  "hidden": false,
969
  "label": "Outlier Removal Threshold",
970
- "model_id": "96591acea02b43f9a366ddfdfff1dfb5",
971
  "url_key": "",
972
  "value": 5,
973
  "widget": "Select"
974
  },
975
  "application/vnd.jupyter.widget-view+json": {
976
- "model_id": "96591acea02b43f9a366ddfdfff1dfb5",
977
  "version_major": 2,
978
  "version_minor": 0
979
  },
@@ -993,17 +999,17 @@
993
  "minmax",
994
  "robust"
995
  ],
996
- "code_uid": "Select.0.40.16.25-rand7528f0a1",
997
  "disabled": false,
998
  "hidden": false,
999
  "label": "Scaling Variables",
1000
- "model_id": "8fc6da55b85c4993bf420c86d2a23a2d",
1001
  "url_key": "",
1002
  "value": "standard",
1003
  "widget": "Select"
1004
  },
1005
  "application/vnd.jupyter.widget-view+json": {
1006
- "model_id": "8fc6da55b85c4993bf420c86d2a23a2d",
1007
  "version_major": 2,
1008
  "version_minor": 0
1009
  },
@@ -1023,17 +1029,17 @@
1023
  "knn",
1024
  "most_frequent"
1025
  ],
1026
- "code_uid": "Select.0.40.16.29-randb1225c9d",
1027
  "disabled": false,
1028
  "hidden": false,
1029
  "label": "Imputation Methods",
1030
- "model_id": "193450229f4543079d7a53267d8c1fe1",
1031
  "url_key": "",
1032
  "value": "median",
1033
  "widget": "Select"
1034
  },
1035
  "application/vnd.jupyter.widget-view+json": {
1036
- "model_id": "193450229f4543079d7a53267d8c1fe1",
1037
  "version_major": 2,
1038
  "version_minor": 0
1039
  },
@@ -1054,17 +1060,17 @@
1054
  "pca",
1055
  "boruta"
1056
  ],
1057
- "code_uid": "Select.0.40.16.34-rand9ae51452",
1058
  "disabled": false,
1059
  "hidden": false,
1060
  "label": "Feature Selection",
1061
- "model_id": "e556e5025dc14e9e867125680d35025f",
1062
  "url_key": "",
1063
  "value": "lasso",
1064
  "widget": "Select"
1065
  },
1066
  "application/vnd.jupyter.widget-view+json": {
1067
- "model_id": "e556e5025dc14e9e867125680d35025f",
1068
  "version_major": 2,
1069
  "version_minor": 0
1070
  },
@@ -1084,17 +1090,17 @@
1084
  "undersampling",
1085
  "rose"
1086
  ],
1087
- "code_uid": "Select.0.40.16.38-rand84f919f9",
1088
  "disabled": false,
1089
  "hidden": false,
1090
  "label": "Imbalance Treatment",
1091
- "model_id": "29560b71dbf84b45a2a487c92c077ad4",
1092
  "url_key": "",
1093
  "value": "smote",
1094
  "widget": "Select"
1095
  },
1096
  "application/vnd.jupyter.widget-view+json": {
1097
- "model_id": "29560b71dbf84b45a2a487c92c077ad4",
1098
  "version_major": 2,
1099
  "version_minor": 0
1100
  },
@@ -1117,17 +1123,17 @@
1117
  "decision_tree",
1118
  "xgboost"
1119
  ],
1120
- "code_uid": "Select.0.40.16.42-rand98b2dc54",
1121
  "disabled": false,
1122
  "hidden": false,
1123
  "label": "Model Selection",
1124
- "model_id": "d90f6d906955444e9abf735d40442d91",
1125
  "url_key": "",
1126
  "value": "random_forest",
1127
  "widget": "Select"
1128
  },
1129
  "application/vnd.jupyter.widget-view+json": {
1130
- "model_id": "d90f6d906955444e9abf735d40442d91",
1131
  "version_major": 2,
1132
  "version_minor": 0
1133
  },
@@ -1142,49 +1148,7 @@
1142
  "name": "stdout",
1143
  "output_type": "stream",
1144
  "text": [
1145
- "------------------------------------------\n",
1146
- "FEATURE REMOVAL\n",
1147
- "Shape of the dataframe is: (1175, 590)\n",
1148
- "the number of columns dropped due to duplications is: 104\n",
1149
- "the number of columns dropped due to missing values is: 28\n",
1150
- "the number of columns dropped due to low variance is: 189\n",
1151
- "the number of columns dropped due to high correlation is: 90\n",
1152
- "Total number of columns to be dropped is: 411\n",
1153
- "New shape of the dataframe is: (1175, 179)\n",
1154
- "<class 'list'>\n",
1155
- "------------------------------------------\n",
1156
- "OUTLIER REMOVAL\n",
1157
- "The z-score threshold is: 5\n",
1158
- "The number of outliers removed from the dataset is: 163\n",
1159
- "<class 'pandas.core.frame.DataFrame'>\n",
1160
- "------------------------------------------\n",
1161
- "SCALING THE DATAFRAME\n",
1162
- "The dataframe has been scaled using the standard scaling model\n",
1163
- "------------------------------------------\n",
1164
- "SCALING THE DATAFRAME\n",
1165
- "The dataframe has been scaled using the standard scaling model\n",
1166
- "------------------------------------------\n",
1167
- "IMPUTATION PROCESS\n",
1168
- "Number of missing values before imputation: 3380\n",
1169
- "median imputation has been applied\n",
1170
- "Number of missing values after imputation: 0\n",
1171
- "------------------------------------------\n",
1172
- "IMPUTATION PROCESS\n",
1173
- "Number of missing values before imputation: 1196\n",
1174
- "median imputation has been applied\n",
1175
- "Number of missing values after imputation: 0\n",
1176
- "------------------------------------------\n",
1177
- "FEATURE SELECTION\n",
1178
- "Selected method is: lasso\n",
1179
- "Shape of the training set after feature selection with LassoCV: (1175, 14)\n",
1180
- "------------------------------------------\n",
1181
- "IMBALANCE TREATMENT\n",
1182
- "Shape of the training set after oversampling with SMOTE: (2194, 14)\n",
1183
- "Value counts of the target variable after oversampling with SMOTE: \n",
1184
- " pass/fail\n",
1185
- "0 1097\n",
1186
- "1 1097\n",
1187
- "dtype: int64\n"
1188
  ]
1189
  }
1190
  ],
@@ -1281,7 +1245,7 @@
1281
  },
1282
  {
1283
  "cell_type": "code",
1284
- "execution_count": 113,
1285
  "metadata": {},
1286
  "outputs": [
1287
  {
@@ -1290,165 +1254,6 @@
1290
  "text": [
1291
  "--------------------------------------------------\n"
1292
  ]
1293
- },
1294
- {
1295
- "data": {
1296
- "text/html": [
1297
- "<div>\n",
1298
- "<style scoped>\n",
1299
- " .dataframe tbody tr th:only-of-type {\n",
1300
- " vertical-align: middle;\n",
1301
- " }\n",
1302
- "\n",
1303
- " .dataframe tbody tr th {\n",
1304
- " vertical-align: top;\n",
1305
- " }\n",
1306
- "\n",
1307
- " .dataframe thead th {\n",
1308
- " text-align: right;\n",
1309
- " }\n",
1310
- "</style>\n",
1311
- "<table border=\"1\" class=\"dataframe\">\n",
1312
- " <thead>\n",
1313
- " <tr style=\"text-align: right;\">\n",
1314
- " <th></th>\n",
1315
- " <th>Model</th>\n",
1316
- " <th>True Negatives</th>\n",
1317
- " <th>False Positives</th>\n",
1318
- " <th>False Negatives</th>\n",
1319
- " <th>True Positives</th>\n",
1320
- " <th>drop duplicates</th>\n",
1321
- " <th>missing values th</th>\n",
1322
- " <th>variance th</th>\n",
1323
- " <th>correlation th</th>\n",
1324
- " <th>outlier removal th</th>\n",
1325
- " <th>scaling method</th>\n",
1326
- " <th>imputation method</th>\n",
1327
- " <th>feature selection</th>\n",
1328
- " <th>imbalance treatment</th>\n",
1329
- " <th>model_variables</th>\n",
1330
- " </tr>\n",
1331
- " </thead>\n",
1332
- " <tbody>\n",
1333
- " <tr>\n",
1334
- " <th>0</th>\n",
1335
- " <td>random_forest</td>\n",
1336
- " <td>344</td>\n",
1337
- " <td>22</td>\n",
1338
- " <td>22</td>\n",
1339
- " <td>4</td>\n",
1340
- " <td>yes</td>\n",
1341
- " <td>50</td>\n",
1342
- " <td>0.05</td>\n",
1343
- " <td>0.95</td>\n",
1344
- " <td>5</td>\n",
1345
- " <td>standard</td>\n",
1346
- " <td>median</td>\n",
1347
- " <td>lasso</td>\n",
1348
- " <td>smote</td>\n",
1349
- " <td>yes_50_0.05_0.95_5_standard_median_lasso_smote</td>\n",
1350
- " </tr>\n",
1351
- " </tbody>\n",
1352
- "</table>\n",
1353
- "</div>"
1354
- ],
1355
- "text/plain": [
1356
- " Model True Negatives False Positives False Negatives \\\n",
1357
- "0 random_forest 344 22 22 \n",
1358
- "\n",
1359
- " True Positives drop duplicates missing values th variance th \\\n",
1360
- "0 4 yes 50 0.05 \n",
1361
- "\n",
1362
- " correlation th outlier removal th scaling method imputation method \\\n",
1363
- "0 0.95 5 standard median \n",
1364
- "\n",
1365
- " feature selection imbalance treatment \\\n",
1366
- "0 lasso smote \n",
1367
- "\n",
1368
- " model_variables \n",
1369
- "0 yes_50_0.05_0.95_5_standard_median_lasso_smote "
1370
- ]
1371
- },
1372
- "metadata": {},
1373
- "output_type": "display_data"
1374
- },
1375
- {
1376
- "data": {
1377
- "text/html": [
1378
- "<div>\n",
1379
- "<style scoped>\n",
1380
- " .dataframe tbody tr th:only-of-type {\n",
1381
- " vertical-align: middle;\n",
1382
- " }\n",
1383
- "\n",
1384
- " .dataframe tbody tr th {\n",
1385
- " vertical-align: top;\n",
1386
- " }\n",
1387
- "\n",
1388
- " .dataframe thead th {\n",
1389
- " text-align: right;\n",
1390
- " }\n",
1391
- "</style>\n",
1392
- "<table border=\"1\" class=\"dataframe\">\n",
1393
- " <thead>\n",
1394
- " <tr style=\"text-align: right;\">\n",
1395
- " <th></th>\n",
1396
- " <th>Model</th>\n",
1397
- " <th>Accuracy</th>\n",
1398
- " <th>Precision</th>\n",
1399
- " <th>Recall</th>\n",
1400
- " <th>F1-score</th>\n",
1401
- " <th>drop duplicates</th>\n",
1402
- " <th>missing values th</th>\n",
1403
- " <th>variance th</th>\n",
1404
- " <th>correlation th</th>\n",
1405
- " <th>outlier removal th</th>\n",
1406
- " <th>scaling method</th>\n",
1407
- " <th>imputation method</th>\n",
1408
- " <th>feature selection</th>\n",
1409
- " <th>imbalance treatment</th>\n",
1410
- " <th>model_variables</th>\n",
1411
- " </tr>\n",
1412
- " </thead>\n",
1413
- " <tbody>\n",
1414
- " <tr>\n",
1415
- " <th>0</th>\n",
1416
- " <td>random_forest</td>\n",
1417
- " <td>0.89</td>\n",
1418
- " <td>0.15</td>\n",
1419
- " <td>0.15</td>\n",
1420
- " <td>0.15</td>\n",
1421
- " <td>yes</td>\n",
1422
- " <td>50</td>\n",
1423
- " <td>0.05</td>\n",
1424
- " <td>0.95</td>\n",
1425
- " <td>5</td>\n",
1426
- " <td>standard</td>\n",
1427
- " <td>median</td>\n",
1428
- " <td>lasso</td>\n",
1429
- " <td>smote</td>\n",
1430
- " <td>yes_50_0.05_0.95_5_standard_median_lasso_smote</td>\n",
1431
- " </tr>\n",
1432
- " </tbody>\n",
1433
- "</table>\n",
1434
- "</div>"
1435
- ],
1436
- "text/plain": [
1437
- " Model Accuracy Precision Recall F1-score drop duplicates \\\n",
1438
- "0 random_forest 0.89 0.15 0.15 0.15 yes \n",
1439
- "\n",
1440
- " missing values th variance th correlation th outlier removal th \\\n",
1441
- "0 50 0.05 0.95 5 \n",
1442
- "\n",
1443
- " scaling method imputation method feature selection imbalance treatment \\\n",
1444
- "0 standard median lasso smote \n",
1445
- "\n",
1446
- " model_variables \n",
1447
- "0 yes_50_0.05_0.95_5_standard_median_lasso_smote "
1448
- ]
1449
- },
1450
- "metadata": {},
1451
- "output_type": "display_data"
1452
  }
1453
  ],
1454
  "source": [
@@ -1458,23 +1263,15 @@
1458
  {
1459
  "attachments": {},
1460
  "cell_type": "markdown",
1461
- "metadata": {
1462
- "slideshow": {
1463
- "slide_type": "skip"
1464
- }
1465
- },
1466
  "source": [
1467
  "#### **Confusion Matrix**"
1468
  ]
1469
  },
1470
  {
1471
  "cell_type": "code",
1472
- "execution_count": 125,
1473
- "metadata": {
1474
- "slideshow": {
1475
- "slide_type": "slide"
1476
- }
1477
- },
1478
  "outputs": [
1479
  {
1480
  "data": {
@@ -1548,6 +1345,90 @@
1548
  "\n",
1549
  "display(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])"
1550
  ]
1551
  }
1552
  ],
1553
  "metadata": {
 
26
  },
27
  {
28
  "cell_type": "code",
29
+ "execution_count": 357,
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
 
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 358,
57
  "metadata": {},
58
  "outputs": [
59
  {
60
  "data": {
61
  "application/mercury+json": {
62
  "allow_download": true,
63
+ "code_uid": "App.0.40.24.1-randd9fe9ae5",
64
  "continuous_update": false,
65
  "description": "Recumpute everything dynamically",
66
  "full_screen": true,
 
92
  },
93
  {
94
  "cell_type": "code",
95
+ "execution_count": 359,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
 
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 360,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "data": {
137
  "application/mercury+json": {
138
+ "code_uid": "Text.0.40.15.11-randec98731b",
139
  "disabled": false,
140
  "hidden": false,
141
  "label": "Test Size Ratio",
142
+ "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
143
  "rows": 1,
144
  "url_key": "",
145
  "value": "0.25",
146
  "widget": "Text"
147
  },
148
  "application/vnd.jupyter.widget-view+json": {
149
+ "model_id": "2157a02ec6544d86bd12bf1e3a15f65e",
150
  "version_major": 2,
151
  "version_minor": 0
152
  },
 
160
  {
161
  "data": {
162
  "application/mercury+json": {
163
+ "code_uid": "Text.0.40.15.14-randfa24ca10",
164
  "disabled": false,
165
  "hidden": false,
166
  "label": "Random State Integer",
167
+ "model_id": "cdaf85c404494bae95a32286425b9034",
168
  "rows": 1,
169
  "url_key": "",
170
  "value": "13",
171
  "widget": "Text"
172
  },
173
  "application/vnd.jupyter.widget-view+json": {
174
+ "model_id": "cdaf85c404494bae95a32286425b9034",
175
  "version_major": 2,
176
  "version_minor": 0
177
  },
 
220
  },
221
  {
222
  "cell_type": "code",
223
+ "execution_count": 361,
224
  "metadata": {},
225
  "outputs": [],
226
  "source": [
227
  "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n",
228
  " correlation_threshold=1.1):\n",
229
  " \n",
230
+ " global feature_removal_report0\n",
231
+ " global feature_removal_report1\n",
232
+ " global feature_removal_report2\n",
233
+ " global feature_removal_report3\n",
234
+ " global feature_removal_report4\n",
235
+ " global feature_removal_report5\n",
236
+ " global feature_removal_report6\n",
237
  " \n",
238
+ " \n",
239
+ " feature_removal_report0 = 'Shape of the dataframe is:' , df.shape\n",
240
  "\n",
241
  " # Drop duplicated columns\n",
242
  " if drop_duplicates == 'yes':\n",
243
  " new_column_names = df.columns\n",
244
  " df = df.T.drop_duplicates().T\n",
245
+ " feature_removal_report1 = 'the number of columns dropped due to duplications is: ', len(new_column_names) - len(df.columns)\n",
246
  " drop_duplicated = list(set(new_column_names) - set(df.columns))\n",
247
  "\n",
248
  " elif drop_duplicates == 'no':\n",
249
  " df = df.T.T\n",
250
+ " feature_removal_report1 = 'No columns were dropped due to duplications' \n",
251
  "\n",
252
  " # Print the percentage of columns in df with missing values more than or equal to threshold\n",
253
+ " feature_removal_report2 = 'the number of columns dropped due to missing values is: ', len(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
254
  " \n",
255
  " # Print into a list the columns to be dropped due to missing values\n",
256
  " drop_missing = list(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n",
 
259
  " df.drop(drop_missing, axis=1, inplace=True)\n",
260
  " \n",
261
  " # Print the number of columns in df with variance less than threshold\n",
262
+ " feature_removal_report3 = 'the number of columns dropped due to low variance is: ', len(df.var()[df.var() <= variance_threshold].index)\n",
263
  "\n",
264
  " # Print into a list the columns to be dropped due to low variance\n",
265
  " drop_variance = list(df.var()[df.var() <= variance_threshold].index)\n",
 
273
  " corr_matrix = df.corr().abs().round(4)\n",
274
  " upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n",
275
  " to_drop = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
276
+ " feature_removal_report4 = 'the number of columns dropped due to high correlation is: ', len(to_drop)\n",
277
  "\n",
278
  " # Print into a list the columns to be dropped due to high correlation\n",
279
  " drop_correlation = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n",
 
287
  " elif drop_duplicates =='no':\n",
288
  " dropped = (drop_missing+drop_variance+drop_correlation)\n",
289
  " \n",
290
+ " feature_removal_report5 = 'Total number of columns to be dropped is: ', len(dropped)\n",
291
+ " feature_removal_report6 = 'New shape of the dataframe is: ', df.shape\n",
292
  "\n",
293
  " global drop_duplicates_var\n",
294
  " drop_duplicates_var = drop_duplicates\n",
 
320
  },
321
  {
322
  "cell_type": "code",
323
+ "execution_count": 362,
324
  "metadata": {},
325
  "outputs": [],
326
  "source": [
327
  "def outlier_removal(z_df, z_threshold=4):\n",
328
  " \n",
329
  " global outlier_var\n",
330
+ " global outlier_removal_report0\n",
331
+ " global outlier_removal_report1\n",
332
  "\n",
333
  "\n",
334
  " if z_threshold == 'none':\n",
335
+ " outlier_removal_report0 = 'No outliers were removed'\n",
336
  " outlier_var = 'none'\n",
337
  " return z_df\n",
338
  " \n",
339
  " else:\n",
340
+ " outlier_removal_report0 = 'The z-score threshold is:', z_threshold\n",
341
  "\n",
342
  " z_df_copy = z_df.copy()\n",
343
  "\n",
 
348
  " z_df_copy[outliers_mask] = np.nan\n",
349
  "\n",
350
  " outliers_count = np.count_nonzero(outliers_mask)\n",
351
+ " outlier_removal_report1 = 'The number of outliers removed from the dataset is:', outliers_count\n",
352
  "\n",
353
  " outlier_var = z_threshold\n",
354
  "\n",
 
355
  " return z_df_copy"
356
  ]
357
  },
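outlier_removal masks any cell whose column-wise z-score magnitude exceeds the threshold and leaves the resulting NaNs for the later imputation step. A minimal standalone sketch of that masking, assuming all-numeric columns (pandas' std() uses ddof=1, so counts may differ slightly from other z-score implementations):

    import pandas as pd

    def zscore_mask(df: pd.DataFrame, z_threshold: float = 4) -> pd.DataFrame:
        # column-wise z-scores; mean() and std() skip existing NaNs
        z = (df - df.mean()) / df.std()
        # cells beyond the threshold become NaN and are imputed later
        return df.mask(z.abs() > z_threshold)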
 
369
  },
370
  {
371
  "cell_type": "code",
372
+ "execution_count": 363,
373
  "metadata": {},
374
  "outputs": [],
375
  "source": [
 
378
  "def scale_dataframe(scale_model,df_fit, df_transform):\n",
379
  " \n",
380
  " global scale_model_var\n",
381
+ " global scaling_report0\n",
382
  "\n",
383
  " if scale_model == 'robust':\n",
384
  " from sklearn.preprocessing import RobustScaler\n",
 
386
  " scaler.fit(df_fit)\n",
387
  " df_scaled = scaler.transform(df_transform)\n",
388
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
389
+ " scaling_report0 = 'The dataframe has been scaled using the robust scaling model'\n",
390
  " scale_model_var = 'robust'\n",
391
  " return df_scaled\n",
392
  " \n",
 
396
  " scaler.fit(df_fit)\n",
397
  " df_scaled = scaler.transform(df_transform)\n",
398
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
399
+ " scaling_report0 = 'The dataframe has been scaled using the standard scaling model'\n",
400
  " scale_model_var = 'standard'\n",
401
  " return df_scaled\n",
402
  " \n",
 
406
  " scaler.fit(df_fit)\n",
407
  " df_scaled = scaler.transform(df_transform)\n",
408
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
409
+ " scaling_report0 = 'The dataframe has been scaled using the normal scaling model'\n",
410
  " scale_model_var = 'normal'\n",
411
  " return df_scaled\n",
412
  " \n",
 
416
  " scaler.fit(df_fit)\n",
417
  " df_scaled = scaler.transform(df_transform)\n",
418
  " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n",
419
+ " scaling_report0 = 'The dataframe has been scaled using the minmax scaling model'\n",
420
  " scale_model_var = 'minmax'\n",
421
  " return df_scaled\n",
422
  " \n",
423
  " elif scale_model == 'none':\n",
424
+ " scaling_report0 = 'The dataframe has not been scaled'\n",
425
  " scale_model_var = 'none'\n",
426
  " return df_transform\n",
427
  " \n",
 
444
  },
445
  {
446
  "cell_type": "code",
447
+ "execution_count": 364,
448
  "metadata": {},
449
  "outputs": [],
450
  "source": [
 
452
  "\n",
453
  "def impute_missing_values(imputation, df_fit, df_transform, n_neighbors=5):\n",
454
  "\n",
455
  " global imputation_var\n",
456
+ " global imputation_report0\n",
457
+ " global imputation_report1\n",
458
+ " global imputation_report2\n",
459
+ " global imputation_report3\n",
460
+ "\n",
461
+ " imputation_report0 = 'Number of missing values before imputation: ', df_transform.isnull().sum().sum()\n",
462
+ "\n",
463
  "\n",
464
  " if imputation == 'knn':\n",
465
  "\n",
 
468
  " imputer.fit(df_fit)\n",
469
  " df_imputed = imputer.transform(df_transform)\n",
470
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
471
+ " imputation_report1 = 'knn imputation has been applied' \n",
472
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
473
  " imputation_var = 'knn'\n",
474
  " return df_imputed\n",
475
  " \n",
 
480
  " imputer.fit(df_fit)\n",
481
  " df_imputed = imputer.transform(df_transform)\n",
482
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
483
+ " imputation_report1 = 'mean imputation has been applied'\n",
484
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
485
  " imputation_var = 'mean'\n",
486
  " return df_imputed\n",
487
  " \n",
 
492
  " imputer.fit(df_fit)\n",
493
  " df_imputed = imputer.transform(df_transform)\n",
494
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
495
+ " imputation_report1 = 'median imputation has been applied'\n",
496
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
497
  " imputation_var = 'median'\n",
498
  " return df_imputed\n",
499
  " \n",
 
504
  " imputer.fit(df_fit)\n",
505
  " df_imputed = imputer.transform(df_transform)\n",
506
  " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n",
507
+ " imputation_report1 = 'most frequent imputation has been applied'\n",
508
+ " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n",
509
  " imputation_var = 'most_frequent'\n",
510
  " return df_imputed\n",
511
  " \n",
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": 365,
533
  "metadata": {},
534
  "outputs": [],
535
  "source": [
 
537
  "\n",
538
  " global feature_selection_var\n",
539
  " global selected_features\n",
540
+ " \n",
541
+ " global feature_selection_report0\n",
542
+ " global feature_selection_report1\n",
543
  "\n",
544
  "\n",
545
  " # if method is boruta, run boruta feature selection and return the selected features and the training set with only the selected features\n",
546
  "\n",
547
  " if method == 'boruta':\n",
548
+ " feature_selection_report0 = 'Selected method is: ', method\n",
549
  " from boruta import BorutaPy\n",
550
  " from sklearn.ensemble import RandomForestClassifier\n",
551
  " rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
 
554
  " selected_feature_indices = boruta_selector.support_\n",
555
  " selected_columns = X_train.columns[selected_feature_indices]\n",
556
  " X_train_filtered = X_train.iloc[:, selected_feature_indices]\n",
557
+ " feature_selection_report1 = 'Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape\n",
558
  " return X_train_filtered, selected_columns\n",
559
  " \n",
560
  " if method == 'none':\n",
561
+ " feature_selection_report = 'No feature selection has been applied'\n",
562
  " X_train_filtered = X_train\n",
563
+ " feature_selection_report = 'Shape of the training set after no feature selection: ', X_train_filtered.shape\n",
564
  " feature_selection_var = 'none'\n",
565
  " selected_features = X_train_filtered.columns\n",
566
  " return X_train_filtered, selected_features \n",
567
  " \n",
568
  " if method == 'lasso':\n",
569
+ " feature_selection_report0 = 'Selected method is: ', method\n",
570
  " from sklearn.linear_model import LassoCV\n",
571
  " from sklearn.feature_selection import SelectFromModel\n",
572
  " lasso = LassoCV().fit(X_train, y_train)\n",
573
  " model = SelectFromModel(lasso, prefit=True)\n",
574
  " X_train_filtered = model.transform(X_train)\n",
575
  " selected_features = X_train.columns[model.get_support()]\n",
576
+ " feature_selection_report1 = 'Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape\n",
577
  " feature_selection_var = 'lasso'\n",
578
  " return X_train_filtered, selected_features\n",
579
  " \n",
580
  " if method == 'pca':\n",
581
+ " feature_selection_report0 = 'Selected method is: ', method\n",
582
  " from sklearn.decomposition import PCA\n",
583
  " pca = PCA(n_components=15)\n",
584
  " X_train_pca = pca.fit_transform(X_train)\n",
585
  " selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n",
586
+ " feature_selection_report1 = 'Shape of the training set after feature selection with PCA: ', X_train_pca.shape\n",
587
  " feature_selection_var = 'pca'\n",
588
  " return X_train_pca, selected_features\n",
589
  " \n",
590
  " if method == 'rfe':\n",
591
+ " feature_selection_report0 = 'Selected method is: ', method\n",
592
  " from sklearn.feature_selection import RFE\n",
593
  " from sklearn.ensemble import RandomForestClassifier\n",
594
  " rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n",
595
  " rfe_selector.fit(X_train, y_train)\n",
596
  " selected_features = X_train.columns[rfe_selector.support_]\n",
597
  " X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n",
598
+ " feature_selection_report1 = 'Shape of the training set after feature selection with RFE: ', X_train_filtered.shape\n",
599
  " feature_selection_var = 'rfe'\n",
600
  " return X_train_filtered, selected_features\n",
601
  " "
 
615
  },
616
  {
617
  "cell_type": "code",
618
+ "execution_count": 366,
619
  "metadata": {},
620
  "outputs": [],
621
  "source": [
 
624
  "def imbalance_treatment(method, X_train, y_train):\n",
625
  "\n",
626
  " global imbalance_var\n",
627
+ " global imbalance_report0\n",
628
+ " global imbalance_report1\n",
 
629
  "\n",
630
  " if method == 'smote': \n",
631
  " from imblearn.over_sampling import SMOTE\n",
632
  " sm = SMOTE(random_state=42)\n",
633
  " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
634
+ " imbalance_report0 = 'Shape of the training set after oversampling with SMOTE: ', X_train_res.shape\n",
635
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with SMOTE: \\n', y_train_res.value_counts()\n",
636
  " imbalance_var = 'smote'\n",
637
  " return X_train_res, y_train_res\n",
638
  " \n",
 
640
  " from imblearn.under_sampling import RandomUnderSampler\n",
641
  " rus = RandomUnderSampler(random_state=42)\n",
642
  " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n",
643
+ " imbalance_report0 = 'Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape\n",
644
+ " imbalance_report1 = 'Value counts of the target variable after undersampling with RandomUnderSampler: \\n', y_train_res.value_counts()\n",
645
  " imbalance_var = 'undersampling'\n",
646
  " return X_train_res, y_train_res\n",
647
  " \n",
 
649
  " from imblearn.over_sampling import RandomOverSampler\n",
650
  " ros = RandomOverSampler(random_state=42)\n",
651
  " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
652
+ " imbalance_report0 = 'Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape\n",
653
+ " imbalance_report1 = 'Value counts of the target variable after oversampling with RandomOverSampler: \\n', y_train_res.value_counts()\n",
654
  " imbalance_var = 'rose'\n",
655
  " return X_train_res, y_train_res\n",
656
  " \n",
 
658
  " if method == 'none':\n",
659
  " X_train_res = X_train\n",
660
  " y_train_res = y_train\n",
661
+ " imbalance_report0 = 'Shape of the training set after no resampling: ', X_train_res.shape\n",
662
+ " imbalance_report1 = 'Value counts of the target variable after no resampling: \\n', y_train_res.value_counts()\n",
663
  " imbalance_var = 'none'\n",
664
  " return X_train_res, y_train_res\n",
665
  " \n",
 
684
  },
685
  {
686
  "cell_type": "code",
687
+ "execution_count": 367,
688
  "metadata": {},
689
  "outputs": [],
690
  "source": [
 
757
  },
758
  {
759
  "cell_type": "code",
760
+ "execution_count": 368,
761
  "metadata": {},
762
  "outputs": [],
763
  "source": [
 
779
  },
780
  {
781
  "cell_type": "code",
782
+ "execution_count": 369,
783
  "metadata": {},
784
  "outputs": [],
785
  "source": [
 
882
  },
883
  {
884
  "cell_type": "code",
885
+ "execution_count": 370,
886
  "metadata": {},
887
  "outputs": [
888
  {
889
  "data": {
890
  "application/mercury+json": {
891
+ "code_uid": "Text.0.40.15.8-rand4a43baec",
892
  "disabled": false,
893
  "hidden": false,
894
  "label": "Missing Value Threeshold",
895
+ "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
896
  "rows": 1,
897
  "url_key": "",
898
  "value": "50",
899
  "widget": "Text"
900
  },
901
  "application/vnd.jupyter.widget-view+json": {
902
+ "model_id": "b2736e53364e4041b6ce10b9e1e1f7d8",
903
  "version_major": 2,
904
  "version_minor": 0
905
  },
 
913
  {
914
  "data": {
915
  "application/mercury+json": {
916
+ "code_uid": "Text.0.40.15.11-rand6f838484",
917
  "disabled": false,
918
  "hidden": false,
919
  "label": "Variance Threshold",
920
+ "model_id": "97419c4a49954b8490aa311870d010b9",
921
  "rows": 1,
922
  "url_key": "",
923
  "value": "0.05",
924
  "widget": "Text"
925
  },
926
  "application/vnd.jupyter.widget-view+json": {
927
+ "model_id": "97419c4a49954b8490aa311870d010b9",
928
  "version_major": 2,
929
  "version_minor": 0
930
  },
 
938
  {
939
  "data": {
940
  "application/mercury+json": {
941
+ "code_uid": "Text.0.40.15.14-rand6243cbfa",
942
  "disabled": false,
943
  "hidden": false,
944
  "label": "Correlation Threshold",
945
+ "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
946
  "rows": 1,
947
  "url_key": "",
948
  "value": "0.95",
949
  "widget": "Text"
950
  },
951
  "application/vnd.jupyter.widget-view+json": {
952
+ "model_id": "e9f072dfb6a241bca69f960fa0aa06a1",
953
  "version_major": 2,
954
  "version_minor": 0
955
  },
 
969
  4,
970
  5
971
  ],
972
+ "code_uid": "Select.0.40.16.18-randa184b437",
973
  "disabled": false,
974
  "hidden": false,
975
  "label": "Outlier Removal Threshold",
976
+ "model_id": "0be493385a154210b3c7685a3bd1074f",
977
  "url_key": "",
978
  "value": 5,
979
  "widget": "Select"
980
  },
981
  "application/vnd.jupyter.widget-view+json": {
982
+ "model_id": "0be493385a154210b3c7685a3bd1074f",
983
  "version_major": 2,
984
  "version_minor": 0
985
  },
 
999
  "minmax",
1000
  "robust"
1001
  ],
1002
+ "code_uid": "Select.0.40.16.25-rand163d8992",
1003
  "disabled": false,
1004
  "hidden": false,
1005
  "label": "Scaling Variables",
1006
+ "model_id": "985eab871677416f9c14ea528b0fd561",
1007
  "url_key": "",
1008
  "value": "standard",
1009
  "widget": "Select"
1010
  },
1011
  "application/vnd.jupyter.widget-view+json": {
1012
+ "model_id": "985eab871677416f9c14ea528b0fd561",
1013
  "version_major": 2,
1014
  "version_minor": 0
1015
  },
 
1029
  "knn",
1030
  "most_frequent"
1031
  ],
1032
+ "code_uid": "Select.0.40.16.29-randb76d7c1d",
1033
  "disabled": false,
1034
  "hidden": false,
1035
  "label": "Imputation Methods",
1036
+ "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1037
  "url_key": "",
1038
  "value": "median",
1039
  "widget": "Select"
1040
  },
1041
  "application/vnd.jupyter.widget-view+json": {
1042
+ "model_id": "eef6b42e02914c98b7e7ed8d0a18df98",
1043
  "version_major": 2,
1044
  "version_minor": 0
1045
  },
 
1060
  "pca",
1061
  "boruta"
1062
  ],
1063
+ "code_uid": "Select.0.40.16.34-rand254bd909",
1064
  "disabled": false,
1065
  "hidden": false,
1066
  "label": "Feature Selection",
1067
+ "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1068
  "url_key": "",
1069
  "value": "lasso",
1070
  "widget": "Select"
1071
  },
1072
  "application/vnd.jupyter.widget-view+json": {
1073
+ "model_id": "f4fc58b330a24bfe8699e0602178b0e1",
1074
  "version_major": 2,
1075
  "version_minor": 0
1076
  },
 
1090
  "undersampling",
1091
  "rose"
1092
  ],
1093
+ "code_uid": "Select.0.40.16.38-rand75e4d938",
1094
  "disabled": false,
1095
  "hidden": false,
1096
  "label": "Imbalance Treatment",
1097
+ "model_id": "965a81a69265473a830f8eec5e8ba2df",
1098
  "url_key": "",
1099
  "value": "smote",
1100
  "widget": "Select"
1101
  },
1102
  "application/vnd.jupyter.widget-view+json": {
1103
+ "model_id": "965a81a69265473a830f8eec5e8ba2df",
1104
  "version_major": 2,
1105
  "version_minor": 0
1106
  },
 
1123
  "decision_tree",
1124
  "xgboost"
1125
  ],
1126
+ "code_uid": "Select.0.40.16.42-rand1bbd78ac",
1127
  "disabled": false,
1128
  "hidden": false,
1129
  "label": "Model Selection",
1130
+ "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1131
  "url_key": "",
1132
  "value": "random_forest",
1133
  "widget": "Select"
1134
  },
1135
  "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "0d1b1477e14b44b99d00dc89dffb70cb",
1137
  "version_major": 2,
1138
  "version_minor": 0
1139
  },
 
1148
  "name": "stdout",
1149
  "output_type": "stream",
1150
  "text": [
1151
+ "<class 'list'>\n"
1152
  ]
1153
  }
1154
  ],
 
1245
  },
1246
  {
1247
  "cell_type": "code",
1248
+ "execution_count": 371,
1249
  "metadata": {},
1250
  "outputs": [
1251
  {
 
1254
  "text": [
1255
  "--------------------------------------------------\n"
1256
  ]
1257
  }
1258
  ],
1259
  "source": [
 
1263
  {
1264
  "attachments": {},
1265
  "cell_type": "markdown",
1266
+ "metadata": {},
1267
  "source": [
1268
  "#### **Confusion Matrix**"
1269
  ]
1270
  },
1271
  {
1272
  "cell_type": "code",
1273
+ "execution_count": 372,
1274
+ "metadata": {},
1275
  "outputs": [
1276
  {
1277
  "data": {
 
1345
  "\n",
1346
  "display(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])"
1347
  ]
1348
+ },
1349
+ {
1350
+ "attachments": {},
1351
+ "cell_type": "markdown",
1352
+ "metadata": {},
1353
+ "source": [
1354
+ "### **Transformations Report**"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "code",
1359
+ "execution_count": 373,
1360
+ "metadata": {},
1361
+ "outputs": [
1362
+ {
1363
+ "name": "stdout",
1364
+ "output_type": "stream",
1365
+ "text": [
1366
+ "------------------------------------------\n",
1367
+ "FEATURE REMOVAL\n",
1368
+ "('Shape of the dataframe is:', (1175, 590))\n",
1369
+ "('the number of columns dropped due to duplications is: ', 104)\n",
1370
+ "('the number of columns dropped due to missing values is: ', 28)\n",
1371
+ "('the number of columns dropped due to low variance is: ', 189)\n",
1372
+ "('the number of columns dropped due to high correlation is: ', 90)\n",
1373
+ "('Total number of columns to be dropped is: ', 411)\n",
1374
+ "('New shape of the dataframe is: ', (1175, 179))\n",
1375
+ "------------------------------------------\n",
1376
+ "OUTLIER REMOVAL\n",
1377
+ "('The z-score threshold is:', 5)\n",
1378
+ "('The number of outliers removed from the dataset is:', 163)\n",
1379
+ "------------------------------------------\n",
1380
+ "SCALING\n",
1381
+ "The dataframe has been scaled using the standard scaling model\n",
1382
+ "------------------------------------------\n",
1383
+ "IMPUTATION\n",
1384
+ "('Number of missing values before imputation: ', 1196)\n",
1385
+ "median imputation has been applied\n",
1386
+ "('Number of missing values after imputation: ', 0)\n",
1387
+ "------------------------------------------\n",
1388
+ "FEATURE SELECTION\n",
1389
+ "('Selected method is: ', 'lasso')\n",
1390
+ "('Shape of the training set after feature selection with LassoCV: ', (1175, 14))\n",
1391
+ "------------------------------------------\n",
1392
+ "IMBALANCE TREATMENT\n",
1393
+ "('Shape of the training set after oversampling with SMOTE: ', (2194, 14))\n",
1394
+ "('Value counts of the target variable after oversampling with SMOTE: \\n', pass/fail\n",
1395
+ "0 1097\n",
1396
+ "1 1097\n",
1397
+ "dtype: int64)\n"
1398
+ ]
1399
+ }
1400
+ ],
1401
+ "source": [
1402
+ "print('------------------------------------------')\n",
1403
+ "print('FEATURE REMOVAL')\n",
1404
+ "print(feature_removal_report0)\n",
1405
+ "print(feature_removal_report1)\n",
1406
+ "print(feature_removal_report2)\n",
1407
+ "print(feature_removal_report3)\n",
1408
+ "print(feature_removal_report4)\n",
1409
+ "print(feature_removal_report5)\n",
1410
+ "print(feature_removal_report6)\n",
1411
+ "print('------------------------------------------')\n",
1412
+ "print('OUTLIER REMOVAL')\n",
1413
+ "print(outlier_removal_report0)\n",
1414
+ "print(outlier_removal_report1)\n",
1415
+ "print('------------------------------------------')\n",
1416
+ "print('SCALING')\n",
1417
+ "print(scaling_report0)\n",
1418
+ "print('------------------------------------------')\n",
1419
+ "print('IMPUTATION')\n",
1420
+ "print(imputation_report0)\n",
1421
+ "print(imputation_report1)\n",
1422
+ "print(imputation_report2)\n",
1423
+ "print('------------------------------------------')\n",
1424
+ "print('FEATURE SELECTION')\n",
1425
+ "print(feature_selection_report0)\n",
1426
+ "print(feature_selection_report1)\n",
1427
+ "print('------------------------------------------')\n",
1428
+ "print('IMBALANCE TREATMENT')\n",
1429
+ "print(imbalance_report0)\n",
1430
+ "print(imbalance_report1)"
1431
+ ]
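The new report cell depends on a dozen numbered globals staying in sync with the pipeline functions. Purely as a design option, not what this commit implements, the same output could come from one shared structure:

    # hypothetical: a single report dict keyed by section instead of numbered globals
    report = {}

    def log(section, message):
        report.setdefault(section, []).append(message)

    # e.g. inside outlier_removal: log('OUTLIER REMOVAL', f'The z-score threshold is: {z_threshold}')

    for section, lines in report.items():
        print('------------------------------------------')
        print(section)
        for line in lines:
            print(line)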
1432
  }
1433
  ],
1434
  "metadata": {