{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "# **Classifying products in Semiconductor Industry**" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Import the data**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "import mercury as mr\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from scipy import stats\n", "from sklearn.model_selection import train_test_split\n", "\n", "from mlxtend.plotting import plot_confusion_matrix\n", "from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score\n", "from mlxtend.plotting import plot_confusion_matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "application/mercury+json": { "allow_download": true, "code_uid": "App.0.40.24.1-rand53016c34", "continuous_update": false, "description": "Recumpute everything dynamically", "full_screen": true, "model_id": "mercury-app", "notify": "{}", "output": "app", "schedule": "", "show_code": false, "show_prompt": false, "show_sidebar": true, "static_notebook": false, "title": "Secom Web App Demo", "widget": "App" }, "text/html": [ "

Mercury Application

This output won't appear in the web app." ], "text/plain": [ "mercury.App" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "app = mr.App(title=\"Secom Web App Demo\", description=\"Recumpute everything dynamically\", continuous_update=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# Read the features data from the the url of csv into pandas dataframes and rename the columns to F1, F2, F3, etc.\n", "# Read the labels data from the url of csv into pandas dataframes and rename the columns to pass/fail and date/time\n", "\n", "#url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'\n", "#url_labels = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'\n", "\n", "url_data = 'secom_data.csv'\n", "url_labels = 'secom_labels.csv'\n", "\n", "features = pd.read_csv(url_data, delimiter=' ', header=None)\n", "labels = pd.read_csv(url_labels, delimiter=' ', names=['pass/fail', 'date_time'])\n", "\n", "prefix = 'F'\n", "new_column_names = [prefix + str(i) for i in range(1, len(features.columns)+1)]\n", "features.columns = new_column_names\n", "\n", "labels['pass/fail'] = labels['pass/fail'].replace({-1: 0, 1: 1})\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Split the data**" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "application/mercury+json": { "code_uid": "Text.0.40.15.11-rand81961de4", "disabled": false, "hidden": false, "label": "Test Size Ratio", "model_id": "cddcc5c10139484dbc19e59ce26f012c", "rows": 1, "url_key": "", "value": "0.25", "widget": "Text" }, "application/vnd.jupyter.widget-view+json": { "model_id": "cddcc5c10139484dbc19e59ce26f012c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Text" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "code_uid": "Text.0.40.15.14-rand72283006", "disabled": false, "hidden": false, "label": "Random State Integer", "model_id": "dcac3f415e624b61aac8a3578e285bca", "rows": 1, "url_key": "", "value": "13", "widget": "Text" }, "application/vnd.jupyter.widget-view+json": { "model_id": "dcac3f415e624b61aac8a3578e285bca", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Text" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# if there is a date/time column, drop it from the features and labels dataframes, else continue\n", "\n", "if 'date_time' in labels.columns:\n", " labels = labels.drop(['date_time'], axis=1)\n", "\n", "\n", "# Split the dataset and the labels into training and testing sets\n", "# use stratify to ensure that the training and testing sets have the same percentage of pass and fail labels\n", "# use random_state to ensure that the same random split is generated each time the code is run\n", "\n", "test_size_num = mr.Text(label=\"Test Size Ratio\", value='0.25') # \n", "test_size_num = float(test_size_num.value)\n", "\n", "random_state_num = mr.Text(label=\"Random State Integer\", value='13') # \n", "random_state_num = int(random_state_num.value)\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " features, labels, test_size = test_size_num, stratify=labels, random_state=random_state_num)\n", "\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Feature Removal**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "def columns_to_drop(df,drop_duplicates='yes', missing_values_threshold=100, variance_threshold=0, \n", " correlation_threshold=1.1):\n", " \n", " global feature_removal_report0\n", " global feature_removal_report1\n", " global feature_removal_report2\n", " global feature_removal_report3\n", " global feature_removal_report4\n", " global feature_removal_report5\n", " global feature_removal_report6\n", " \n", " \n", " feature_removal_report0 = 'Shape of the dataframe is:' , df.shape\n", "\n", " # Drop duplicated columns\n", " if drop_duplicates == 'yes':\n", " new_column_names = df.columns\n", " df = df.T.drop_duplicates().T\n", " feature_removal_report1 = 'the number of columns dropped due to duplications is: ', len(new_column_names) - len(df.columns)\n", " drop_duplicated = list(set(new_column_names) - set(df.columns))\n", "\n", " elif drop_duplicates == 'no':\n", " df = df.T.T\n", " feature_removal_report1 = 'No columns were dropped due to duplications' \n", "\n", " # Print the percentage of columns in df with missing values more than or equal to threshold\n", " feature_removal_report2 = 'the number of columns dropped due to missing values is: ', len(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n", " \n", " # Print into a list the columns to be dropped due to missing values\n", " drop_missing = list(df.isnull().mean()[df.isnull().mean() > missing_values_threshold/100].index)\n", "\n", " # Drop columns with more than or equal to threshold missing values from df\n", " df.drop(drop_missing, axis=1, inplace=True)\n", " \n", " # Print the number of columns in df with variance less than threshold\n", " feature_removal_report3 = 'the number of columns dropped due to low variance is: ', len(df.var()[df.var() <= variance_threshold].index)\n", "\n", " # Print into a list the columns to be dropped due to low variance\n", " drop_variance = list(df.var()[df.var() <= variance_threshold].index)\n", "\n", " # Drop columns with more than or equal to threshold variance from df\n", " df.drop(drop_variance, axis=1, inplace=True)\n", "\n", " # Print the number of columns in df with more than or equal to threshold correlation\n", " \n", " # Create correlation matrix and round it to 4 decimal places\n", " corr_matrix = df.corr().abs().round(4)\n", " upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n", " to_drop = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n", " feature_removal_report4 = 'the number of columns dropped due to high correlation is: ', len(to_drop)\n", "\n", " # Print into a list the columns to be dropped due to high correlation\n", " drop_correlation = [column for column in upper.columns if any(upper[column] >= correlation_threshold)]\n", "\n", " # Drop columns with more than or equal to threshold correlation from df\n", " df.drop(to_drop, axis=1, inplace=True)\n", " \n", " if drop_duplicates == 'yes':\n", " dropped = (drop_duplicated+drop_missing+drop_variance+drop_correlation)\n", "\n", " elif drop_duplicates =='no':\n", " dropped = (drop_missing+drop_variance+drop_correlation)\n", " \n", " feature_removal_report5 = 'Total number of columns to be dropped is: ', len(dropped)\n", " feature_removal_report6 = 'New shape of the dataframe is: ', df.shape\n", "\n", " global drop_duplicates_var\n", " drop_duplicates_var = drop_duplicates\n", " \n", " global missing_values_threshold_var\n", " missing_values_threshold_var = missing_values_threshold\n", "\n", " global variance_threshold_var\n", " variance_threshold_var = variance_threshold\n", "\n", " global correlation_threshold_var\n", " correlation_threshold_var = correlation_threshold\n", " \n", " return dropped" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Outlier Removal**" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "def outlier_removal(z_df, action = 'ignore', z_threshold=3):\n", " \n", " global outlier_var\n", " global outlier_removal_report0\n", " global outlier_removal_report1\n", "\n", " if action == 'ignore':\n", " outlier_removal_report0 = 'No z-score threshold was selected'\n", " outlier_var = 'none'\n", " z_df_copy = z_df.copy()\n", " outlier_removal_report1 = 'No outliers were removed from the dataset'\n", " \n", " if action == 'remove':\n", " \n", " outlier_removal_report0 = 'The z-score threshold is:', z_threshold\n", "\n", " z_df_copy = z_df.copy()\n", "\n", " z_scores = np.abs(stats.zscore(z_df_copy))\n", "\n", " # Identify the outliers in the dataset using the z-score method\n", " outliers_mask = z_scores > z_threshold\n", " z_df_copy[outliers_mask] = np.nan\n", "\n", " outliers_count = np.count_nonzero(outliers_mask)\n", " outlier_removal_report1 = 'The number of outliers removed from the dataset is:', outliers_count\n", "\n", " outlier_var = z_threshold\n", "\n", " if action == 'push':\n", "\n", " # push the outliers to the threshold value\n", " outlier_removal_report0 = 'The z-score threshold is:', z_threshold\n", "\n", " z_df_copy = z_df.copy()\n", "\n", " z_scores = np.abs(stats.zscore(z_df_copy))\n", "\n", " # Identify the outliers in the dataset using the z-score method\n", " outliers_mask = z_scores > z_threshold\n", " z_df_copy[outliers_mask] = np.sign(z_df_copy[outliers_mask]) * (3 * np.std(z_df_copy)) + np.mean(z_df_copy)\n", "\n", " outliers_count = np.count_nonzero(outliers_mask)\n", " outlier_removal_report1 = 'The number of outliers pushed to the boundaries is:', outliers_count\n", "\n", " outlier_var = str(action) + '-' + str(z_threshold) + 's'\n", "\n", " \n", " return z_df_copy" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Scaling Methods**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# define a function to scale the dataframe using different scaling models\n", "\n", "def scale_dataframe(scale_model,df_fit, df_transform):\n", " \n", " global scale_model_var\n", " global scaling_report0\n", "\n", " if scale_model == 'robust':\n", " from sklearn.preprocessing import RobustScaler\n", " scaler = RobustScaler()\n", " scaler.fit(df_fit)\n", " df_scaled = scaler.transform(df_transform)\n", " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n", " scaling_report0 = 'The dataframe has been scaled using the robust scaling model'\n", " scale_model_var = 'robust'\n", " return df_scaled\n", " \n", " elif scale_model == 'standard':\n", " from sklearn.preprocessing import StandardScaler\n", " scaler = StandardScaler()\n", " scaler.fit(df_fit)\n", " df_scaled = scaler.transform(df_transform)\n", " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n", " scaling_report0 = 'The dataframe has been scaled using the standard scaling model'\n", " scale_model_var = 'standard'\n", " return df_scaled\n", " \n", " elif scale_model == 'normal':\n", " from sklearn.preprocessing import Normalizer\n", " scaler = Normalizer()\n", " scaler.fit(df_fit)\n", " df_scaled = scaler.transform(df_transform)\n", " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n", " scaling_report0 = 'The dataframe has been scaled using the normal scaling model'\n", " scale_model_var = 'normal'\n", " return df_scaled\n", " \n", " elif scale_model == 'minmax':\n", " from sklearn.preprocessing import MinMaxScaler\n", " scaler = MinMaxScaler()\n", " scaler.fit(df_fit)\n", " df_scaled = scaler.transform(df_transform)\n", " df_scaled = pd.DataFrame(df_scaled, columns=df_transform.columns)\n", " scaling_report0 = 'The dataframe has been scaled using the minmax scaling model'\n", " scale_model_var = 'minmax'\n", " return df_scaled\n", " \n", " elif scale_model == 'none':\n", " scaling_report0 = 'The dataframe has not been scaled'\n", " scale_model_var = 'none'\n", " return df_transform\n", " \n", " else:\n", " print('Please choose a valid scaling model: robust, standard, normal, or minmax')\n", " return None" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Missing Value Imputation**" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# define a function to impute missing values using different imputation models\n", "\n", "def impute_missing_values(imputation, df_fit, df_transform, n_neighbors=5):\n", "\n", " global imputation_var\n", " global imputation_report0\n", " global imputation_report1\n", " global imputation_report2\n", " \n", "\n", " imputation_report0 = 'Number of missing values before imputation: ', df_transform.isnull().sum().sum()\n", "\n", "\n", " if imputation == 'knn':\n", "\n", " from sklearn.impute import KNNImputer\n", " imputer = KNNImputer(n_neighbors=n_neighbors)\n", " imputer.fit(df_fit)\n", " df_imputed = imputer.transform(df_transform)\n", " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n", " imputation_report1 = 'knn imputation has been applied' \n", " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n", " imputation_var = 'knn'\n", " return df_imputed\n", " \n", " elif imputation == 'mean':\n", "\n", " from sklearn.impute import SimpleImputer\n", " imputer = SimpleImputer(strategy='mean')\n", " imputer.fit(df_fit)\n", " df_imputed = imputer.transform(df_transform)\n", " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n", " imputation_report1 = 'mean imputation has been applied'\n", " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n", " imputation_var = 'mean'\n", " return df_imputed\n", " \n", " elif imputation == 'median':\n", "\n", " from sklearn.impute import SimpleImputer\n", " imputer = SimpleImputer(strategy='median')\n", " imputer.fit(df_fit)\n", " df_imputed = imputer.transform(df_transform)\n", " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n", " imputation_report1 = 'median imputation has been applied'\n", " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n", " imputation_var = 'median'\n", " return df_imputed\n", " \n", " elif imputation == 'most_frequent':\n", " \n", " from sklearn.impute import SimpleImputer\n", " imputer = SimpleImputer(strategy='most_frequent')\n", " imputer.fit(df_fit)\n", " df_imputed = imputer.transform(df_transform)\n", " df_imputed = pd.DataFrame(df_imputed, columns=df_transform.columns)\n", " imputation_report1 = 'most frequent imputation has been applied'\n", " imputation_report2 = 'Number of missing values after imputation: ', df_imputed.isnull().sum().sum()\n", " imputation_var = 'most_frequent'\n", " return df_imputed\n", " \n", " else:\n", " print('Please choose an imputation model from the following: knn, mean, median, most_frequent')\n", " df_imputed = df_transform.copy()\n", " return df_imputed\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Feature Reduction / Selection**" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "def feature_selection(method, X_train, y_train):\n", "\n", " global feature_selection_var\n", " global selected_features\n", " \n", " global feature_selection_report0\n", " global feature_selection_report1\n", "\n", "\n", " # if method is boruta, run boruta feature selection and return the selected features and the training set with only the selected features\n", "\n", " if method == 'boruta':\n", " feature_selection_report0 = 'Selected method is: ', method\n", " from boruta import BorutaPy\n", " from sklearn.ensemble import RandomForestClassifier\n", " rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n", " boruta_selector = BorutaPy(rf,n_estimators='auto', verbose=0, random_state=42)\n", " boruta_selector.fit(X_train.values, y_train.values.ravel())\n", " selected_feature_indices = boruta_selector.support_\n", " selected_columns = X_train.columns[selected_feature_indices]\n", " X_train_filtered = X_train.iloc[:, selected_feature_indices]\n", " feature_selection_report1 = 'Shape of the training set after feature selection with Boruta: ', X_train_filtered.shape\n", " return X_train_filtered, selected_columns\n", " \n", " if method == 'none':\n", " feature_selection_report0 = 'No feature selection has been applied'\n", " X_train_filtered = X_train\n", " feature_selection_report1 = 'Shape of the training set after no feature selection: ', X_train_filtered.shape\n", " feature_selection_var = 'none'\n", " selected_features = X_train_filtered.columns\n", " return X_train_filtered, selected_features \n", " \n", " if method == 'lasso':\n", " feature_selection_report0 = 'Selected method is: ', method\n", " from sklearn.linear_model import LassoCV\n", " from sklearn.feature_selection import SelectFromModel\n", " lasso = LassoCV().fit(X_train, y_train)\n", " model = SelectFromModel(lasso, prefit=True)\n", " X_train_filtered = model.transform(X_train)\n", " selected_features = X_train.columns[model.get_support()]\n", " feature_selection_report1 = 'Shape of the training set after feature selection with LassoCV: ', X_train_filtered.shape\n", " feature_selection_var = 'lasso'\n", " return X_train_filtered, selected_features\n", " \n", " if method == 'pca':\n", " feature_selection_report0 = 'Selected method is: ', method\n", " from sklearn.decomposition import PCA\n", " pca = PCA(n_components=15)\n", " X_train_pca = pca.fit_transform(X_train)\n", " selected_features = X_train.columns[pca.explained_variance_ratio_.argsort()[::-1]][:15]\n", " feature_selection_report1 = 'Shape of the training set after feature selection with PCA: ', X_train_pca.shape\n", " feature_selection_var = 'pca'\n", " return X_train_pca, selected_features\n", " \n", " if method == 'rfe':\n", " feature_selection_report0 = 'Selected method is: ', method\n", " from sklearn.feature_selection import RFE\n", " from sklearn.ensemble import RandomForestClassifier\n", " rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), n_features_to_select=15, step=10, verbose=0)\n", " rfe_selector.fit(X_train, y_train)\n", " selected_features = X_train.columns[rfe_selector.support_]\n", " X_train_filtered = X_train.iloc[:, rfe_selector.support_]\n", " feature_selection_report1 = 'Shape of the training set after feature selection with RFE: ', X_train_filtered.shape\n", " feature_selection_var = 'rfe'\n", " return X_train_filtered, selected_features\n", " " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Imbalance Treatment**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#define a function to oversample and understamble the imbalance in the training set\n", "\n", "def imbalance_treatment(method, X_train, y_train):\n", "\n", " global imbalance_var\n", " global imbalance_report0\n", " global imbalance_report1\n", " global imbalance_report2\n", " \n", "\n", "\n", " if method == 'smote': \n", " from imblearn.over_sampling import SMOTE\n", " sm = SMOTE(random_state=42)\n", " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n", " imbalance_report0 = 'Shape of the training set after oversampling with SMOTE: ', X_train_res.shape\n", " imbalance_report1 = 'Value counts of the target variable after oversampling with SMOTE: '\n", " imbalance_report2 = y_train_res.value_counts()\n", " imbalance_var = 'smote'\n", " return X_train_res, y_train_res\n", " \n", " if method == 'undersampling':\n", " from imblearn.under_sampling import RandomUnderSampler\n", " rus = RandomUnderSampler(random_state=42)\n", " X_train_res, y_train_res = rus.fit_resample(X_train, y_train)\n", " imbalance_report0 = 'Shape of the training set after undersampling with RandomUnderSampler: ', X_train_res.shape\n", " imbalance_report1 = 'Value counts of the target variable after undersampling with RandomUnderSampler: '\n", " imbalance_report2 = y_train_res.value_counts()\n", " imbalance_var = 'undersampling'\n", " return X_train_res, y_train_res\n", " \n", " if method == 'rose':\n", " from imblearn.over_sampling import RandomOverSampler\n", " ros = RandomOverSampler(random_state=42)\n", " X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n", " imbalance_report0 = 'Shape of the training set after oversampling with RandomOverSampler: ', X_train_res.shape\n", " imbalance_report1 = 'Value counts of the target variable after oversampling with RandomOverSampler: '\n", " imbalance_report2 = y_train_res.value_counts()\n", " imbalance_var = 'rose'\n", " return X_train_res, y_train_res\n", " \n", " \n", " if method == 'none':\n", " X_train_res = X_train\n", " y_train_res = y_train\n", " imbalance_report0 = 'Shape of the training set after no resampling: ', X_train_res.shape\n", " imbalance_report1 = 'Value counts of the target variable after no resampling: '\n", " imbalance_report2 = y_train_res.value_counts()\n", " imbalance_var = 'none'\n", " return X_train_res, y_train_res\n", " \n", " else:\n", " print('Please choose a valid resampling method: smote, rose, undersampling or none')\n", " X_train_res = X_train\n", " y_train_res = y_train\n", " return X_train_res, y_train_res\n", " \n", " " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Training Models**" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# define a function where you can choose the model you want to use to train the data\n", "\n", "def train_model(model, X_train, y_train, X_test, y_test):\n", "\n", " global model_var\n", "\n", " if model == 'random_forest':\n", " from sklearn.ensemble import RandomForestClassifier\n", " rfc = RandomForestClassifier(n_estimators=100, random_state=13)\n", " rfc.fit(X_train, y_train)\n", " y_pred = rfc.predict(X_test)\n", " model_var = 'random_forest'\n", " return y_pred\n", "\n", " if model == 'logistic_regression':\n", " from sklearn.linear_model import LogisticRegression\n", " lr = LogisticRegression()\n", " lr.fit(X_train, y_train)\n", " y_pred = lr.predict(X_test)\n", " model_var = 'logistic_regression'\n", " return y_pred\n", " \n", " if model == 'knn':\n", " from sklearn.neighbors import KNeighborsClassifier\n", " knn = KNeighborsClassifier(n_neighbors=5)\n", " knn.fit(X_train, y_train)\n", " y_pred = knn.predict(X_test)\n", " model_var = 'knn'\n", " return y_pred\n", " \n", " if model == 'svm':\n", " from sklearn.svm import SVC\n", " svm = SVC()\n", " svm.fit(X_train, y_train)\n", " y_pred = svm.predict(X_test)\n", " model_var = 'svm'\n", " return y_pred\n", " \n", " if model == 'naive_bayes':\n", " from sklearn.naive_bayes import GaussianNB\n", " nb = GaussianNB()\n", " nb.fit(X_train, y_train)\n", " y_pred = nb.predict(X_test)\n", " model_var = 'naive_bayes'\n", " return y_pred\n", " \n", " if model == 'decision_tree':\n", " from sklearn.tree import DecisionTreeClassifier\n", " dt = DecisionTreeClassifier()\n", " dt.fit(X_train, y_train)\n", " y_pred = dt.predict(X_test)\n", " model_var = 'decision_tree'\n", " return y_pred\n", " \n", " if model == 'xgboost':\n", " from xgboost import XGBClassifier\n", " xgb = XGBClassifier()\n", " xgb.fit(X_train, y_train)\n", " y_pred = xgb.predict(X_test)\n", " model_var = 'xgboost'\n", " return y_pred\n", " \n", " else:\n", " print('Please choose a model from the following: random_forest, logistic_regression, knn, svm, naive_bayes, decision_tree, xgboost')\n", " return None" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "evaluation_score_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'model_variables'])\n", "evaluation_count_df = pd.DataFrame(columns=['Model', 'True Negatives', 'False Positives', 'False Negatives', 'True Positives', 'model_variables'])" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Evaluation Function**" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#define a function that prints the strings below\n", "def evaluate_models(model='random_forest'):\n", " \n", " all_models = ['random_forest', 'logistic_regression', 'knn', 'svm', 'naive_bayes', 'decision_tree', 'xgboost']\n", " evaluation_score_append = []\n", " evaluation_count_append = []\n", " \n", " for selected_model in all_models:\n", " \n", " if model == 'all' or model == selected_model:\n", "\n", " evaluation_score = []\n", " evaluation_count = []\n", "\n", " y_pred = globals()['y_pred_' + selected_model] # Get the prediction variable dynamically\n", "\n", " def namestr(obj, namespace):\n", " return [name for name in namespace if namespace[name] is obj]\n", "\n", " model_name = namestr(y_pred, globals())[0]\n", " model_name = model_name.replace('y_pred_', '') \n", "\n", " cm = confusion_matrix(y_test, y_pred)\n", "\n", " # create a dataframe with the results for each model\n", "\n", " evaluation_score.append(model_name)\n", " evaluation_score.append(round(accuracy_score(y_test, y_pred), 2))\n", " evaluation_score.append(round(precision_score(y_test, y_pred, zero_division=0), 2))\n", " evaluation_score.append(round(recall_score(y_test, y_pred), 2))\n", " evaluation_score.append(round(f1_score(y_test, y_pred), 2))\n", " evaluation_score_append.append(evaluation_score)\n", "\n", "\n", " # create a dataframe with the true positives, true negatives, false positives and false negatives for each model\n", "\n", " evaluation_count.append(model_name)\n", " evaluation_count.append(cm[0][0])\n", " evaluation_count.append(cm[0][1])\n", " evaluation_count.append(cm[1][0])\n", " evaluation_count.append(cm[1][1])\n", " evaluation_count_append.append(evaluation_count)\n", "\n", " \n", " evaluation_score_append = pd.DataFrame(evaluation_score_append, \n", " columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score'])\n", " \n", " evaluation_score_append['drop duplicates'] = drop_duplicates_var\n", " evaluation_score_append['missing values th'] = missing_values_threshold_var\n", " evaluation_score_append['variance th'] = variance_threshold_var\n", " evaluation_score_append['correlation th'] = correlation_threshold_var\n", " evaluation_score_append['outlier removal th'] = outlier_var\n", " evaluation_score_append['scaling method'] = scale_model_var\n", " evaluation_score_append['imputation method'] = imputation_var\n", " evaluation_score_append['feature selection'] = feature_selection_var\n", " evaluation_score_append['imbalance treatment'] = imbalance_var\n", "\n", "\n", " evaluation_score_append['model_variables'] = drop_duplicates_var + '_' + str(missing_values_threshold_var) + '_' + str(\n", " variance_threshold_var) + '_' + str(correlation_threshold_var) + '_' + str(\n", " outlier_var) + '_' + scale_model_var + '_' + imputation_var + '_' + feature_selection_var + '_' + imbalance_var\n", " \n", "\n", " evaluation_count_append = pd.DataFrame(evaluation_count_append,\n", " columns=['Model', 'True Negatives', 'False Positives', 'False Negatives', 'True Positives'])\n", " \n", " evaluation_count_append['drop duplicates'] = drop_duplicates_var\n", " evaluation_count_append['missing values th'] = missing_values_threshold_var\n", " evaluation_count_append['variance th'] = variance_threshold_var\n", " evaluation_count_append['correlation th'] = correlation_threshold_var\n", " evaluation_count_append['outlier removal th'] = outlier_var\n", " evaluation_count_append['scaling method'] = scale_model_var\n", " evaluation_count_append['imputation method'] = imputation_var\n", " evaluation_count_append['feature selection'] = feature_selection_var\n", " evaluation_count_append['imbalance treatment'] = imbalance_var\n", " \n", " evaluation_count_append['model_variables'] = drop_duplicates_var + '_' + str(missing_values_threshold_var) + '_' + str(\n", " variance_threshold_var) + '_' + str(correlation_threshold_var) + '_' + str(\n", " outlier_var) + '_' + scale_model_var + '_' + imputation_var + '_' + feature_selection_var + '_' + imbalance_var\n", " \n", " return evaluation_score_append, evaluation_count_append" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "### **Input Variables**" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "application/mercury+json": { "code_uid": "Text.0.40.15.8-randd265d777", "disabled": false, "hidden": false, "label": "Missing Value Threeshold", "model_id": "aec705cdd896483f9d28d01d5e488a64", "rows": 1, "url_key": "", "value": "50", "widget": "Text" }, "application/vnd.jupyter.widget-view+json": { "model_id": "aec705cdd896483f9d28d01d5e488a64", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Text" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "code_uid": "Text.0.40.15.11-rand7024cef0", "disabled": false, "hidden": false, "label": "Variance Threshold", "model_id": "19c44ea2727948d09fa75e29c62844d8", "rows": 1, "url_key": "", "value": "0.05", "widget": "Text" }, "application/vnd.jupyter.widget-view+json": { "model_id": "19c44ea2727948d09fa75e29c62844d8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Text" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "code_uid": "Text.0.40.15.14-randa98919fc", "disabled": false, "hidden": false, "label": "Correlation Threshold", "model_id": "dd6d45837f4b45d3a4d58e9821188473", "rows": 1, "url_key": "", "value": "0.95", "widget": "Text" }, "application/vnd.jupyter.widget-view+json": { "model_id": "dd6d45837f4b45d3a4d58e9821188473", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Text" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "ignore", "remove", "push" ], "code_uid": "Select.0.40.16.19-rand82f0f224", "disabled": false, "hidden": false, "label": "Outlier Action", "model_id": "021d7973917845d4842f11c6d9f2fc67", "url_key": "", "value": "ignore", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "021d7973917845d4842f11c6d9f2fc67", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "none", 3, 4, 5 ], "code_uid": "Select.0.40.16.22-rand043a8dbf", "disabled": false, "hidden": false, "label": "Outlier Action Threshold", "model_id": "e0e42c0aef7845838da1aa1c70828482", "url_key": "", "value": "none", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "e0e42c0aef7845838da1aa1c70828482", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "none", "standard", "minmax", "robust" ], "code_uid": "Select.0.40.16.29-rand05477265", "disabled": false, "hidden": false, "label": "Scaling Variables", "model_id": "3938af66d22649bfb1cdaa77c4fb9684", "url_key": "", "value": "none", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "3938af66d22649bfb1cdaa77c4fb9684", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "mean", "median", "knn", "most_frequent" ], "code_uid": "Select.0.40.16.33-randade919ed", "disabled": false, "hidden": false, "label": "Imputation Methods", "model_id": "9367b36a66f148d2acd012c4f1c4fd71", "url_key": "", "value": "median", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "9367b36a66f148d2acd012c4f1c4fd71", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "none", "lasso", "rfe", "pca", "boruta" ], "code_uid": "Select.0.40.16.38-randff2c0505", "disabled": false, "hidden": false, "label": "Feature Selection", "model_id": "d9e985281de24f87a5e3d20d79348999", "url_key": "", "value": "none", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "d9e985281de24f87a5e3d20d79348999", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "none", "smote", "undersampling", "rose" ], "code_uid": "Select.0.40.16.42-rand81ad2a30", "disabled": false, "hidden": false, "label": "Imbalance Treatment", "model_id": "368fdc5c1dfa46029723962befc161a1", "url_key": "", "value": "none", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "368fdc5c1dfa46029723962befc161a1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": { "choices": [ "random_forest", "logistic_regression", "knn", "svm", "naive_bayes", "decision_tree", "xgboost" ], "code_uid": "Select.0.40.16.46-rand5302564f", "disabled": false, "hidden": false, "label": "Model Selection", "model_id": "ad9082261f9e49b387ff963824152a05", "url_key": "", "value": "random_forest", "widget": "Select" }, "application/vnd.jupyter.widget-view+json": { "model_id": "ad9082261f9e49b387ff963824152a05", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# input train and test sets\n", "input_train_set = X_train\n", "input_test_set = X_test\n", "\n", "# Start widget section\n", "\n", "input_drop_duplicates = 'yes'\n", "input_missing_values_threshold = mr.Text(label=\"Missing Value Threeshold\", value='50')\n", "input_missing_values_threshold = int(input_missing_values_threshold.value)\n", "\n", "input_variance_threshold = mr.Text(label=\"Variance Threshold\", value='0.05') # \n", "input_variance_threshold = float(input_variance_threshold.value)\n", "\n", "input_correlation_threshold = mr.Text(label=\"Correlation Threshold\", value='0.95') # \n", "input_correlation_threshold = float(input_correlation_threshold.value)\n", "\n", "# input outlier removal variables\n", "\n", "input_outlier_action = mr.Select(label=\"Outlier Action\", value='ignore', choices=['ignore', 'remove', 'push']) # 'ignore', 'remove', 'push'\n", "input_outlier_action = str(input_outlier_action.value)\n", "\n", "input_outlier_removal_threshold = mr.Select(label=\"Outlier Action Threshold\", value='none', choices=['none', 3, 4, 5]) # 'none' or zscore from 0 to 100\n", "if input_outlier_removal_threshold.value != 'none':\n", " input_outlier_removal_threshold = int(input_outlier_removal_threshold.value)\n", "elif input_outlier_removal_threshold.value == 'none':\n", " input_outlier_removal_threshold = str(input_outlier_removal_threshold.value)\n", "\n", "# input scaling variables\n", "input_scale_model = mr.Select(label=\"Scaling Variables\", value=\"none\", choices=['none', 'standard', 'minmax', 'robust']) # 'none', 'normal', 'standard', 'minmax', 'robust'\n", "input_scale_model = str(input_scale_model.value)\n", "\n", "# input imputation variables\n", "input_imputation_method = mr.Select(label=\"Imputation Methods\", value=\"median\", choices=['mean', 'median', 'knn', 'most_frequent']) # 'mean', 'median', 'knn', 'most_frequent'\n", "input_n_neighbors = 5 # only for knn imputation\n", "input_imputation_method = str(input_imputation_method.value)\n", "\n", "# import feature selection variables\n", "input_feature_selection = mr.Select(label=\"Feature Selection\", value=\"none\", choices=['none', 'lasso', 'rfe', 'pca', 'boruta']) # 'none', 'lasso', 'rfe', 'pca', 'boruta'\n", "input_feature_selection = str(input_feature_selection.value)\n", "\n", "# input imbalance treatment variables\n", "input_imbalance_treatment = mr.Select(label=\"Imbalance Treatment\", value=\"none\", choices=['none', 'smote', 'undersampling', 'rose']) # 'none', 'smote', 'undersampling', 'rose'\n", "input_imbalance_treatment = str(input_imbalance_treatment.value)\n", "\n", "# input model\n", "input_model = mr.Select(label=\"Model Selection\", value=\"random_forest\", choices=['random_forest', 'logistic_regression', 'knn', 'svm', 'naive_bayes','decision_tree','xgboost'])\n", "input_model = str(input_model.value)\n", "\n", "# remove features using the function list_columns_to_drop\n", "\n", "dropped = columns_to_drop(input_train_set, input_drop_duplicates, input_missing_values_threshold, input_variance_threshold, input_correlation_threshold)\n", "\n", "# drop the columns from the training and testing sets and save the new sets as new variables\n", "\n", "X_train2 = input_train_set.drop(dropped, axis=1)\n", "X_test2 = input_test_set.drop(dropped, axis=1)\n", "\n", "\n", "# remove outliers from train dataset\n", "\n", "X_train_dropped_outliers = outlier_removal(X_train2, input_outlier_action, input_outlier_removal_threshold)\n", "\n", "# scale the training and testing sets\n", "\n", "X_train_scaled = scale_dataframe(input_scale_model, X_train_dropped_outliers, X_train_dropped_outliers)\n", "X_test_scaled = scale_dataframe(input_scale_model, X_train_dropped_outliers, X_test2)\n", "\n", "# impute the missing values in the training and testing sets using the function impute_missing_values\n", "\n", "X_train_imputed = impute_missing_values(input_imputation_method,X_train_scaled, X_train_scaled, input_n_neighbors)\n", "X_test_imputed = impute_missing_values(input_imputation_method,X_train_scaled, X_test_scaled, input_n_neighbors)\n", "\n", "# select features\n", "\n", "X_train_selected, selected_features = feature_selection(input_feature_selection, X_train_imputed, y_train)\n", "\n", "X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)\n", "X_test_selected = X_test_imputed[selected_features]\n", "\n", "# treat imbalance in the training set using the function oversample\n", "\n", "X_train_res, y_train_res = imbalance_treatment(input_imbalance_treatment, X_train_selected, y_train)\n", "\n", "# train the model using the function train_model and save the predictions as new variables\n", "\n", "y_pred_random_forest = train_model('random_forest', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_logistic_regression = train_model('logistic_regression', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_knn = train_model('knn', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_svm = train_model('svm', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_naive_bayes = train_model('naive_bayes', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_decision_tree = train_model('decision_tree', X_train_res, y_train_res, X_test_selected, y_test)\n", "y_pred_xgboost = train_model('xgboost', X_train_res, y_train_res, X_test_selected, y_test)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "evaluation_score_output, evaluation_counts_output = evaluate_models(input_model)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "#### **Confusion Matrix**" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Accuracy Precision Recall F1-score\n", "0 0.93 0.0 0.0 0.0\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# create a np.array with selected_model values\n", "\n", "conf_matrix = np.array([[evaluation_counts_output['True Negatives'].values[0], evaluation_counts_output['False Positives'].values[0]],\n", " [evaluation_counts_output['False Negatives'].values[0], evaluation_counts_output['True Positives'].values[0]]])\n", "\n", "fig, ax = plot_confusion_matrix(\n", " conf_mat=conf_matrix,\n", " show_absolute=True,\n", " show_normed=True\n", ")\n", "\n", "print(evaluation_score_output[['Accuracy', 'Precision', 'Recall', 'F1-score']])\n", "plt.show()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "### **Transformations Report**" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FEATURE REMOVAL\n", "('the number of columns dropped due to duplications is: ', 104)\n", "('the number of columns dropped due to missing values is: ', 28)\n", "('the number of columns dropped due to low variance is: ', 189)\n", "('the number of columns dropped due to high correlation is: ', 90)\n", "('Total number of columns to be dropped is: ', 411)\n", "('New shape of the dataframe is: ', (1175, 179))\n", "------------------------------------------\n", "OUTLIER REMOVAL\n", "No z-score threshold was selected\n", "No outliers were removed from the dataset\n", "------------------------------------------\n", "SCALING\n", "The dataframe has not been scaled\n", "------------------------------------------\n", "IMPUTATION\n", "('Number of missing values before imputation: ', 1196)\n", "median imputation has been applied\n", "('Number of missing values after imputation: ', 0)\n", "------------------------------------------\n", "FEATURE SELECTION\n", "No feature selection has been applied\n", "('Shape of the training set after no feature selection: ', (1175, 179))\n", "------------------------------------------\n", "IMBALANCE TREATMENT\n", "('Shape of the training set after no resampling: ', (1175, 179))\n", "Value counts of the target variable after no resampling: \n", "pass/fail\n", "0 1097\n", "1 78\n", "dtype: int64\n" ] } ], "source": [ "print('FEATURE REMOVAL')\n", "print(feature_removal_report1)\n", "print(feature_removal_report2)\n", "print(feature_removal_report3)\n", "print(feature_removal_report4)\n", "print(feature_removal_report5)\n", "print(feature_removal_report6)\n", "print('------------------------------------------')\n", "print('OUTLIER REMOVAL')\n", "print(outlier_removal_report0)\n", "print(outlier_removal_report1)\n", "print('------------------------------------------')\n", "print('SCALING')\n", "print(scaling_report0)\n", "print('------------------------------------------')\n", "print('IMPUTATION')\n", "print(imputation_report0)\n", "print(imputation_report1)\n", "print(imputation_report2)\n", "print('------------------------------------------')\n", "print('FEATURE SELECTION')\n", "print(feature_selection_report0)\n", "print(feature_selection_report1)\n", "print('------------------------------------------')\n", "print('IMBALANCE TREATMENT')\n", "print(imbalance_report0)\n", "print(imbalance_report1)\n", "print(imbalance_report2)" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }