File size: 19,957 Bytes

2880a2f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "db772bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data handling\n",
    "import pandas as pd\n",
    "import numpy as np \n",
    "\n",
    "\n",
    "# EDA (pandas-profiling, etc. )\n",
    "...\n",
    "\n",
    "# Feature Processing (Scikit-learn processing, etc. )\n",
    "from sklearn import preprocessing\n",
    "\n",
    "# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )\n",
    "...\n",
    "\n",
    "# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )\n",
    "...\n",
    "\n",
    "# Other packages\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "#display all columns and rows \n",
    "pd.set_option('display.max_columns', None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d80b4220",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class counts before SMOTE: No     4111\n",
      "Yes    1505\n",
      "Name: Churn, dtype: int64\n",
      "Class counts after SMOTE: Yes    4111\n",
      "No     4111\n",
      "Name: Churn, dtype: int64\n",
      "AdaBoost Classifier: 0.9019360028118717\n",
      "Logistic Regression Classifier: 0.8608679697080713\n",
      "Random Forest Classifier: 0.9311295690912422\n",
      "Gradient Boosting Classifier: 0.9235269779240596\n",
      "SVM Classifier: 0.8944493562575639\n",
      "Best model: Random Forest Classifier\n",
      "AdaBoost Classifier classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.90      0.76      0.82      1053\n",
      "         Yes       0.50      0.74      0.60       352\n",
      "\n",
      "    accuracy                           0.75      1405\n",
      "   macro avg       0.70      0.75      0.71      1405\n",
      "weighted avg       0.80      0.75      0.77      1405\n",
      "\n",
      "\n",
      "Logistic Regression Classifier classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.92      0.73      0.81      1053\n",
      "         Yes       0.49      0.80      0.61       352\n",
      "\n",
      "    accuracy                           0.74      1405\n",
      "   macro avg       0.70      0.76      0.71      1405\n",
      "weighted avg       0.81      0.74      0.76      1405\n",
      "\n",
      "\n",
      "Random Forest Classifier classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.86      0.84      0.85      1053\n",
      "         Yes       0.56      0.61      0.58       352\n",
      "\n",
      "    accuracy                           0.78      1405\n",
      "   macro avg       0.71      0.72      0.72      1405\n",
      "weighted avg       0.79      0.78      0.79      1405\n",
      "\n",
      "\n",
      "Gradient Boosting Classifier classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.89      0.80      0.84      1053\n",
      "         Yes       0.54      0.69      0.60       352\n",
      "\n",
      "    accuracy                           0.77      1405\n",
      "   macro avg       0.71      0.74      0.72      1405\n",
      "weighted avg       0.80      0.77      0.78      1405\n",
      "\n",
      "\n",
      "SVM Classifier classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.89      0.77      0.83      1053\n",
      "         Yes       0.52      0.73      0.60       352\n",
      "\n",
      "    accuracy                           0.76      1405\n",
      "   macro avg       0.71      0.75      0.72      1405\n",
      "weighted avg       0.80      0.76      0.77      1405\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# For CSV, use pandas.read_csv\n",
    "\n",
    "df = pd.read_csv(\"Telco-Customer-Churn.csv\")\n",
    "df.drop(['customerID'], axis=1, inplace=True)\n",
    "# Coerce the conversion of TotalCharges column to float\n",
    "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')\n",
    "# Remove the duplicate rows\n",
    "df = df.drop_duplicates()\n",
    "\n",
    "cols_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']\n",
    "df[cols_to_replace] = df[cols_to_replace].replace('No internet service', 'No').replace('No phone service', 'No')\n",
    "\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# split the data into features (X) and target variable (y)\n",
    "X = df.drop('Churn', axis=1)\n",
    "y = df['Churn']\n",
    "\n",
    "# split the data into train and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Identify numeric and non-numeric columns\n",
    "num_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n",
    "cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()\n",
    "\n",
    "\n",
    "'''creating copy of the categorical features and numerical features\n",
    "before imputing null value to avoid modifying the orginal dataset'''\n",
    "\n",
    "X_train_cat = X_train[cat_cols].copy()\n",
    "X_train_num = X_train[num_cols].copy()\n",
    "\n",
    "X_test_cat = X_test[cat_cols].copy()\n",
    "X_test_num = X_test[num_cols].copy()\n",
    "\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "\n",
    "# Creating imputer variables\n",
    "numerical_imputer = SimpleImputer(strategy = \"mean\")\n",
    "categorical_imputer = SimpleImputer(strategy = \"most_frequent\")\n",
    "\n",
    "\n",
    "# Define the column transformer\n",
    "categorical_features = cat_cols\n",
    "categorical_transformer = Pipeline(steps=[\n",
    "    ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto', sparse=False))\n",
    "])\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('cat', categorical_transformer, categorical_features)\n",
    "    ])\n",
    "\n",
    "# Fitting the Imputer\n",
    "X_train_cat_imputed = categorical_imputer.fit_transform(X_train_cat)\n",
    "X_train_num_imputed = numerical_imputer.fit_transform(X_train_num)\n",
    "\n",
    "X_test_cat_imputed = categorical_imputer.fit_transform(X_test_cat)\n",
    "X_test_num_imputed = numerical_imputer.fit_transform(X_test_num)\n",
    "\n",
    "encoder=OneHotEncoder(handle_unknown='ignore')\n",
    "\n",
    "# encoding the xtrain categories and converting to a dataframe\n",
    "X_train_cat_encoded = encoder.fit(X_train_cat_imputed)\n",
    "X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat_imputed).toarray(),\n",
    "                                   columns=encoder.get_feature_names_out(cat_cols))\n",
    "\n",
    "# encoding the xeval categories and converting to a dataframe\n",
    "X_test_cat_encoded = encoder.fit(X_test_cat_imputed)\n",
    "X_test_cat_encoded = pd.DataFrame(encoder.transform(X_test_cat_imputed).toarray(),\n",
    "                                   columns=encoder.get_feature_names_out(cat_cols))\n",
    "\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "scaler= StandardScaler()\n",
    "\n",
    "X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)\n",
    "X_train_num_sc = pd.DataFrame(X_train_num_scaled, columns = num_cols)\n",
    "\n",
    "X_test_num_scaled = scaler.fit_transform(X_test_num_imputed)\n",
    "X_test_num_sc = pd.DataFrame(X_test_num_scaled, columns = num_cols)\n",
    "\n",
    "X_train_df = pd.concat([X_train_num_sc,X_train_cat_encoded], axis =1)\n",
    "X_test_df = pd.concat([X_test_num_sc,X_test_cat_encoded], axis =1)\n",
    "\n",
    "\n",
    "#Training over SMOTE-balanced data with roc_auc scoring \n",
    "\n",
    "\n",
    "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "# initialize SMOTE\n",
    "sm = SMOTE(random_state=42)\n",
    "\n",
    "# fit SMOTE on the training data and resample it\n",
    "X_train_resampled, y_train_resampled = sm.fit_resample(X_train_df, y_train)\n",
    "\n",
    "# print class counts before and after SMOTE\n",
    "print(f'Class counts before SMOTE: {y_train.value_counts()}')\n",
    "print(f'Class counts after SMOTE: {y_train_resampled.value_counts()}')\n",
    "\n",
    "# create a dictionary of models to fit\n",
    "models = {\n",
    "    'AdaBoost Classifier': AdaBoostClassifier(),\n",
    "    'Logistic Regression Classifier': LogisticRegression(),\n",
    "    'Random Forest Classifier': RandomForestClassifier(),\n",
    "    'Gradient Boosting Classifier': GradientBoostingClassifier(),\n",
    "    'SVM Classifier': SVC(probability=True)\n",
    "}\n",
    "\n",
    "# iterate over the models and fit each one to the resampled training data\n",
    "for name, model in models.items():\n",
    "    model.fit(X_train_resampled, y_train_resampled)\n",
    "    \n",
    "# evaluate each model using cross-validation based on ROC-AUC\n",
    "roc_auc_scores = {}\n",
    "for name, model in models.items():\n",
    "    scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')\n",
    "    roc_auc_scores[name] = scores.mean()\n",
    "    \n",
    "# print the ROC-AUC scores for each model\n",
    "for name, score in roc_auc_scores.items():\n",
    "    print(f'{name}: {score}')\n",
    "\n",
    "# choose the model with the highest ROC-AUC score\n",
    "best_model_name = max(roc_auc_scores, key=roc_auc_scores.get)\n",
    "best_model = models[best_model_name]\n",
    "print(f'Best model: {best_model_name}')\n",
    "\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# iterate over the models and make predictions on the test data for each one\n",
    "for name, model in models.items():\n",
    "    # fit the model to the resampled training data\n",
    "    model.fit(X_train_resampled, y_train_resampled)\n",
    "    # make predictions on the test data\n",
    "    y_pred = model.predict(X_test_df)\n",
    "    # generate the classification report\n",
    "    report = classification_report(y_test, y_pred)\n",
    "    # print the classification report\n",
    "    print(f'{name} classification report:\\n{report}\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4aab6799",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SeniorCitizen',\n",
       " 'tenure',\n",
       " 'MonthlyCharges',\n",
       " 'TotalCharges',\n",
       " 'gender_Female',\n",
       " 'gender_Male',\n",
       " 'Partner_No',\n",
       " 'Partner_Yes',\n",
       " 'Dependents_No',\n",
       " 'Dependents_Yes',\n",
       " 'PhoneService_No',\n",
       " 'PhoneService_Yes',\n",
       " 'MultipleLines_No',\n",
       " 'MultipleLines_Yes',\n",
       " 'InternetService_DSL',\n",
       " 'InternetService_Fiber optic',\n",
       " 'InternetService_No',\n",
       " 'OnlineSecurity_No',\n",
       " 'OnlineSecurity_Yes',\n",
       " 'OnlineBackup_No',\n",
       " 'OnlineBackup_Yes',\n",
       " 'DeviceProtection_No',\n",
       " 'DeviceProtection_Yes',\n",
       " 'TechSupport_No',\n",
       " 'TechSupport_Yes',\n",
       " 'StreamingTV_No',\n",
       " 'StreamingTV_Yes',\n",
       " 'StreamingMovies_No',\n",
       " 'StreamingMovies_Yes',\n",
       " 'Contract_Month-to-month',\n",
       " 'Contract_One year',\n",
       " 'Contract_Two year',\n",
       " 'PaperlessBilling_No',\n",
       " 'PaperlessBilling_Yes',\n",
       " 'PaymentMethod_Bank transfer (automatic)',\n",
       " 'PaymentMethod_Credit card (automatic)',\n",
       " 'PaymentMethod_Electronic check',\n",
       " 'PaymentMethod_Mailed check']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_df.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d53e6b9e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column 'gender' categories: ['Female' 'Male']\n",
      "Column 'SeniorCitizen' categories: [0 1]\n",
      "Column 'Partner' categories: ['Yes' 'No']\n",
      "Column 'Dependents' categories: ['No' 'Yes']\n",
      "Column 'tenure' categories: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27\n",
      "  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68\n",
      " 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0\n",
      " 39]\n",
      "Column 'PhoneService' categories: ['No' 'Yes']\n",
      "Column 'MultipleLines' categories: ['No' 'Yes']\n",
      "Column 'InternetService' categories: ['DSL' 'Fiber optic' 'No']\n",
      "Column 'OnlineSecurity' categories: ['No' 'Yes']\n",
      "Column 'OnlineBackup' categories: ['Yes' 'No']\n",
      "Column 'DeviceProtection' categories: ['No' 'Yes']\n",
      "Column 'TechSupport' categories: ['No' 'Yes']\n",
      "Column 'StreamingTV' categories: ['No' 'Yes']\n",
      "Column 'StreamingMovies' categories: ['No' 'Yes']\n",
      "Column 'Contract' categories: ['Month-to-month' 'One year' 'Two year']\n",
      "Column 'PaperlessBilling' categories: ['Yes' 'No']\n",
      "Column 'PaymentMethod' categories: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'\n",
      " 'Credit card (automatic)']\n",
      "Column 'MonthlyCharges' categories: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]\n",
      "Column 'TotalCharges' categories: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]\n"
     ]
    }
   ],
   "source": [
    "for col in X.columns:\n",
    "    print(f\"Column '{col}' categories: {X[col].unique()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b6f7708a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best model: Random Forest Classifier\n"
     ]
    }
   ],
   "source": [
    "best_model_name = 'Random Forest Classifier'\n",
    "\n",
    "best_model = models[best_model_name]\n",
    "\n",
    "print(f'Best model: {best_model_name}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2adb8c7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "          No       0.85      0.86      0.86      1053\n",
      "         Yes       0.57      0.56      0.56       352\n",
      "\n",
      "    accuracy                           0.78      1405\n",
      "   macro avg       0.71      0.71      0.71      1405\n",
      "weighted avg       0.78      0.78      0.78      1405\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Calculate the class weights\n",
    "class_weight = {\"No\": 1, \"Yes\": 10}\n",
    "\n",
    "# Initialize Logistic Regression model with class weights\n",
    "rf = RandomForestClassifier(class_weight=class_weight)\n",
    "\n",
    "# Fit the model to the training data\n",
    "rf.fit(X_train_resampled, y_train_resampled)\n",
    "\n",
    "# Predict the labels of the test set\n",
    "y_pred = rf.predict(X_test_df)\n",
    "\n",
    "# Generate the classification report\n",
    "report = classification_report(y_test, y_pred)\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3ca066e7",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from joblib import dump\n",
    "import os\n",
    "\n",
    "# set the destination path to the \"export\" directory\n",
    "destination = \".\"\n",
    "\n",
    "# create a dictionary to store the objects and their filenames\n",
    "models = {\"numerical_imputer\": numerical_imputer,\n",
    "          \"categorical_imputer\": categorical_imputer,\n",
    "          \"encoder\": encoder,\n",
    "          \"scaler\": scaler,\n",
    "          \"Final_model\": best_model}\n",
    "\n",
    "# loop through the models and save them using joblib.dump()\n",
    "for name, model in models.items():\n",
    "    dump(model, os.path.join(destination, f\"{name}.joblib\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2185d2f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip freeze > requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8117c959",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO: Successfully saved requirements file in .\\requirements.txt\n"
     ]
    }
   ],
   "source": [
    "!pipreqs . --force"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "33af820b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip list --format=freeze > requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "816b3fe9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "numerical_imputer saved successfully!\n",
      "categorical_imputer saved successfully!\n",
      "encoder saved successfully!\n",
      "scaler saved successfully!\n",
      "Final_model saved successfully!\n"
     ]
    }
   ],
   "source": [
    "for name, model in models.items():\n",
    "    dump(model, os.path.join(destination, f\"{name}.joblib\"))\n",
    "    if os.path.exists(os.path.join(destination, f\"{name}.joblib\")):\n",
    "        print(f\"{name} saved successfully!\")\n",
    "    else:\n",
    "        print(f\"{name} failed to save.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "5143eadb",
   "metadata": {},
   "outputs": [],
   "source": [
    "destination = \".\"\n",
    "numerical_imputer = joblib.load(os.path.join(destination, \"numerical_imputer.joblib\"))\n",
    "categorical_imputer = joblib.load(os.path.join(destination, \"categorical_imputer.joblib\"))\n",
    "encoder = joblib.load(os.path.join(destination, \"encoder.joblib\"))\n",
    "scaler = joblib.load(os.path.join(destination, \"scaler.joblib\"))\n",
    "best_model = joblib.load(os.path.join(destination, \"Final_model.joblib\"))\n",
    "\n",
    "loaded_models = {\"numerical_imputer\": numerical_imputer,\n",
    "                 \"categorical_imputer\": categorical_imputer,\n",
    "                 \"encoder\": encoder,\n",
    "                 \"scaler\": scaler,\n",
    "                 \"Final_model\": best_model}\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}