{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "r02ouRHg8ZmZ"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"source": [
"# Path to the raw dataset CSV (Colab upload location under /content).\n",
"DATA_PATH = '/content/diabetes_prediction_dataset.csv'\n",
"df = pd.read_csv(DATA_PATH)"
],
"metadata": {
"id": "GlmmGECmNK2K"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Preview the first rows to check column names and value formats.\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "NNKuyPZcNQV1",
"outputId": "3f89b7a4-c026-410c-92c4-b065047ebd39"
},
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" gender age hypertension heart_disease smoking_history bmi \\\n",
"0 Female 80.0 0 1 never 25.19 \n",
"1 Female 54.0 0 0 No Info 27.32 \n",
"2 Male 28.0 0 0 never 27.32 \n",
"3 Female 36.0 0 0 current 23.45 \n",
"4 Male 76.0 1 1 current 20.14 \n",
"\n",
" HbA1c_level blood_glucose_level diabetes \n",
"0 6.6 140 0 \n",
"1 6.6 80 0 \n",
"2 5.7 158 0 \n",
"3 5.0 155 0 \n",
"4 4.8 155 0 "
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gender | \n",
" age | \n",
" hypertension | \n",
" heart_disease | \n",
" smoking_history | \n",
" bmi | \n",
" HbA1c_level | \n",
" blood_glucose_level | \n",
" diabetes | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Female | \n",
" 80.0 | \n",
" 0 | \n",
" 1 | \n",
" never | \n",
" 25.19 | \n",
" 6.6 | \n",
" 140 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Female | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" No Info | \n",
" 27.32 | \n",
" 6.6 | \n",
" 80 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Male | \n",
" 28.0 | \n",
" 0 | \n",
" 0 | \n",
" never | \n",
" 27.32 | \n",
" 5.7 | \n",
" 158 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Female | \n",
" 36.0 | \n",
" 0 | \n",
" 0 | \n",
" current | \n",
" 23.45 | \n",
" 5.0 | \n",
" 155 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Male | \n",
" 76.0 | \n",
" 1 | \n",
" 1 | \n",
" current | \n",
" 20.14 | \n",
" 4.8 | \n",
" 155 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 100000,\n \"fields\": [\n {\n \"column\": \"gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Female\",\n \"Male\",\n \"Other\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22.51683987161513,\n \"min\": 0.08,\n \"max\": 80.0,\n \"num_unique_values\": 102,\n \"samples\": [\n 29.0,\n 39.0,\n 16.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hypertension\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"heart_disease\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"never\",\n \"No Info\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"bmi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6.636783416649581,\n \"min\": 10.01,\n \"max\": 95.69,\n \"num_unique_values\": 4247,\n \"samples\": [\n 53.27,\n 32.33\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"HbA1c_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0706720918832282,\n \"min\": 3.5,\n \"max\": 9.0,\n \"num_unique_values\": 18,\n \"samples\": [\n 6.6,\n 5.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"blood_glucose_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40,\n \"min\": 80,\n \"max\": 300,\n \"num_unique_values\": 18,\n \"samples\": [\n 140,\n 
80\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diabetes\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"source": [
"# Summarise the dtype and non-null count of every column.\n",
"df.info()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Mz3gCmUwNRgg",
"outputId": "3a18dba4-7aae-4424-9ac3-d88cda778b6b"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"RangeIndex: 100000 entries, 0 to 99999\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 gender 100000 non-null object \n",
" 1 age 100000 non-null float64\n",
" 2 hypertension 100000 non-null int64 \n",
" 3 heart_disease 100000 non-null int64 \n",
" 4 smoking_history 100000 non-null object \n",
" 5 bmi 100000 non-null float64\n",
" 6 HbA1c_level 100000 non-null float64\n",
" 7 blood_glucose_level 100000 non-null int64 \n",
" 8 diabetes 100000 non-null int64 \n",
"dtypes: float64(3), int64(4), object(2)\n",
"memory usage: 6.9+ MB\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Remove exact duplicate rows. Rebinding `df` (instead of `inplace=True`)\n",
"# avoids mutating the loaded frame in place and keeps the call chainable.\n",
"df = df.drop_duplicates()\n",
"df.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PGOOQLmk4Sdq",
"outputId": "80d9226f-1abf-4ee4-f9a5-10b031fe79bc"
},
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(96146, 9)"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"# features=X.columns.tolist()\n",
"# pd.plotting.scatter_matrix(X[features], figsize=(12, 12))\n",
"# plt.show()"
],
"metadata": {
"id": "ETJDhwxQNjlH"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def cor(df, th):\n",
"    \"\"\"Collect columns whose correlation with any preceding column exceeds ``th``.\n",
"\n",
"    Only the strictly lower triangle of ``df.corr(numeric_only=True)`` is\n",
"    scanned, so for every strongly correlated pair it is the later column\n",
"    (in correlation-matrix order) that gets reported.\n",
"\n",
"    Args:\n",
"        df: DataFrame to analyse; non-numeric columns are ignored by ``corr``.\n",
"        th: Threshold compared (strictly) against the absolute correlation.\n",
"\n",
"    Returns:\n",
"        set: Names of the flagged columns; empty when no pair exceeds ``th``.\n",
"    \"\"\"\n",
"    matrix = df.corr(numeric_only=True)\n",
"    flagged = set()\n",
"\n",
"    for row, name in enumerate(matrix.columns):\n",
"        # Compare this column only against the columns that come before it.\n",
"        if any(abs(matrix.iloc[row, col]) > th for col in range(row)):\n",
"            flagged.add(name)\n",
"    return flagged\n",
"\n",
"# Flag columns whose |correlation| with an earlier column is above 0.9 (empty set = none).\n",
"cor(df,0.9)\n",
"\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-Huy54T34SRo",
"outputId": "21c9ab27-2198-4dcb-9944-6954064ed94d"
},
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"set()"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"# Count the duplicate rows currently present in df.\n",
"df.duplicated().sum()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ChebluFo4Sbl",
"outputId": "4001ca7f-a39f-4b60-b88f-063958e12799"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"# List the float64/int64 columns (the same dtype filter detect_outlier_iqr applies).\n",
"df.select_dtypes(include = ['float64','int64']).columns"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZSCgG10z7AXq",
"outputId": "1b7bb858-e852-4cc4-b342-296d9a54a28c"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',\n",
" 'blood_glucose_level', 'diabetes'],\n",
" dtype='object')"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [
"# Outlier check based on the 1.5 * IQR rule\n",
"def detect_outlier_iqr(df):\n",
"    \"\"\"Count, per float64/int64 column, the values outside the 1.5 * IQR fences.\n",
"\n",
"    A value is counted when it lies strictly below ``Q1 - 1.5 * IQR`` or\n",
"    strictly above ``Q3 + 1.5 * IQR``.\n",
"\n",
"    Args:\n",
"        df: DataFrame to scan; only float64 and int64 columns are considered.\n",
"\n",
"    Returns:\n",
"        dict: Column name -> number of outlying values in that column.\n",
"    \"\"\"\n",
"    counts = {}\n",
"    numeric_cols = df.select_dtypes(include = ['float64','int64']).columns\n",
"\n",
"    for col in numeric_cols:\n",
"        values = df[col]\n",
"        q1 = values.quantile(0.25)\n",
"        q3 = values.quantile(0.75)\n",
"\n",
"        # Distance of the fences from the quartiles.\n",
"        reach = 1.5 * (q3 - q1)\n",
"        outside = (values < q1 - reach) | (values > q3 + reach)\n",
"\n",
"        counts[col] = values[outside].count()\n",
"\n",
"    return counts"
],
"metadata": {
"id": "xtBGtixl5VYZ"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"detect_outlier_iqr(df)  # tree-based models are planned, so the outliers are left in place"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YrZhSaoI7sWZ",
"outputId": "f565065c-045d-4258-ee00-dd4583e62c8e"
},
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'age': 0,\n",
" 'hypertension': 7461,\n",
" 'heart_disease': 3923,\n",
" 'bmi': 5354,\n",
" 'HbA1c_level': 1312,\n",
" 'blood_glucose_level': 2031,\n",
" 'diabetes': 8482}"
]
},
"metadata": {},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Separate the predictors from the binary `diabetes` target.\n",
"X = df.drop('diabetes', axis=1)\n",
"y = df['diabetes']\n",
"\n",
"# Hold out 20% of the rows for testing. `stratify=y` keeps the share of\n",
"# positive cases equal in both splits, which matters because diabetes cases\n",
"# are a small minority of the rows.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y\n",
")"
],
"metadata": {
"id": "P87fPcx6ClP1"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"categorical_cols = ['gender', 'smoking_history']\n",
"\n",
"# One-hot encode the categorical columns and pass the remaining ones through.\n",
"# handle_unknown='ignore' stops transform() from raising when a split (or\n",
"# future data) contains a category that was absent from the training rows,\n",
"# e.g. the rare gender value 'Other'.\n",
"# verbose_feature_names_out=False keeps plain output names such as\n",
"# 'gender_Female' and 'age' instead of prefixing the transformer name.\n",
"ct = ColumnTransformer(\n",
"    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],\n",
"    remainder='passthrough',  # Keep non-categorical features unchanged\n",
"    verbose_feature_names_out=False\n",
")\n",
"\n",
"# Fit the encoder on the training set only, then transform both splits.\n",
"# NOTE: X_train / X_test are overwritten with the encoded arrays, so re-run\n",
"# the train/test split cell before running this cell a second time.\n",
"X_train = ct.fit_transform(X_train)\n",
"X_test = ct.transform(X_test)\n",
"\n",
"# Ask the fitted transformer for the output column names instead of\n",
"# rebuilding them by hand, so they always match the real column order.\n",
"columns = list(ct.get_feature_names_out())\n",
"\n",
"X_train_df = pd.DataFrame(X_train, columns=columns)\n",
"X_test_df = pd.DataFrame(X_test, columns=columns)\n",
"\n",
"X_train_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 243
},
"id": "y5IMX2wF7xNt",
"outputId": "1a27a9c1-149a-436d-b227-d2d080a7ed80"
},
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" gender_Female gender_Male gender_Other smoking_history_No Info \\\n",
"0 1.0 0.0 0.0 1.0 \n",
"1 1.0 0.0 0.0 1.0 \n",
"2 1.0 0.0 0.0 0.0 \n",
"3 0.0 1.0 0.0 0.0 \n",
"4 0.0 1.0 0.0 0.0 \n",
"\n",
" smoking_history_current smoking_history_ever smoking_history_former \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 1.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" smoking_history_never smoking_history_not current age hypertension \\\n",
"0 0.0 0.0 35.0 0.0 \n",
"1 0.0 0.0 0.4 0.0 \n",
"2 0.0 0.0 60.0 0.0 \n",
"3 1.0 0.0 66.0 0.0 \n",
"4 0.0 1.0 48.0 0.0 \n",
"\n",
" heart_disease bmi HbA1c_level blood_glucose_level \n",
"0 0.0 31.53 6.6 155.0 \n",
"1 0.0 15.19 4.0 130.0 \n",
"2 0.0 20.90 6.5 100.0 \n",
"3 0.0 27.05 6.0 130.0 \n",
"4 0.0 24.05 4.8 145.0 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gender_Female | \n",
" gender_Male | \n",
" gender_Other | \n",
" smoking_history_No Info | \n",
" smoking_history_current | \n",
" smoking_history_ever | \n",
" smoking_history_former | \n",
" smoking_history_never | \n",
" smoking_history_not current | \n",
" age | \n",
" hypertension | \n",
" heart_disease | \n",
" bmi | \n",
" HbA1c_level | \n",
" blood_glucose_level | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 35.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 31.53 | \n",
" 6.6 | \n",
" 155.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.4 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 15.19 | \n",
" 4.0 | \n",
" 130.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 60.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 20.90 | \n",
" 6.5 | \n",
" 100.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 66.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 27.05 | \n",
" 6.0 | \n",
" 130.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 48.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 24.05 | \n",
" 4.8 | \n",
" 145.0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "X_train_df",
"summary": "{\n \"name\": \"X_train_df\",\n \"rows\": 76916,\n \"fields\": [\n {\n \"column\": \"gender_Female\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.49278981616535134,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gender_Male\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.49275404821913577,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gender_Other\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.014421462123147873,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history_No Info\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47468796206505487,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history_current\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29436237193571957,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history_ever\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2004279188212293,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history_former\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2955547024342698,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"smoking_history_never\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4790868208182658,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"smoking_history_not current\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.24779545740747777,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22.461482386186912,\n \"min\": 0.08,\n \"max\": 80.0,\n \"num_unique_values\": 102,\n \"samples\": [\n 29.0,\n 65.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hypertension\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2667059473374165,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"heart_disease\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.197157379821037,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"bmi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6.770758482966483,\n \"min\": 10.01,\n \"max\": 95.69,\n \"num_unique_values\": 4103,\n \"samples\": [\n 46.0,\n 39.55\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"HbA1c_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0731461261964725,\n \"min\": 3.5,\n \"max\": 9.0,\n \"num_unique_values\": 18,\n \"samples\": [\n 6.6,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"blood_glucose_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 
40.92295872881815,\n \"min\": 80.0,\n \"max\": 300.0,\n \"num_unique_values\": 18,\n \"samples\": [\n 155.0,\n 130.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"source": [
"# training with random_forest classifier"
],
"metadata": {
"id": "WSyGSoJ1Hur_"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, accuracy_score\n",
"\n",
"# Baseline random forest with default hyperparameters and a fixed seed.\n",
"rf_model = RandomForestClassifier(random_state=42)\n",
"rf_model.fit(X_train, y_train)\n",
"\n",
"# Score the held-out split.\n",
"y_pred = rf_model.predict(X_test)\n",
"rf_accuracy = accuracy_score(y_test, y_pred)\n",
"rf_report = classification_report(y_test, y_pred)\n",
"\n",
"print(f'Accuracy: {rf_accuracy}')\n",
"print(f'Classification Report: \\n{rf_report}')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "M83wlm6sC91d",
"outputId": "91d25ca1-4c46-4c96-a9c3-e715257c9c83"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.9686427457098284\n",
"Classification Report: \n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 1.00 0.98 17509\n",
" 1 0.95 0.69 0.80 1721\n",
"\n",
" accuracy 0.97 19230\n",
" macro avg 0.96 0.84 0.89 19230\n",
"weighted avg 0.97 0.97 0.97 19230\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import xgboost as xgb\n",
"\n",
"# Baseline XGBoost classifier. The target is binary, so the matching\n",
"# evaluation metric is 'logloss' ('mlogloss' is the multiclass variant).\n",
"xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')\n",
"\n",
"xgb_model.fit(X_train_df, y_train)\n",
"\n",
"# Score the held-out split.\n",
"y_pred_xgb = xgb_model.predict(X_test_df)\n",
"\n",
"print(f'Accuracy: {accuracy_score(y_test, y_pred_xgb)}')\n",
"print(f'Classification Report: \\n{classification_report(y_test, y_pred_xgb)}')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zc97ymCIHDF6",
"outputId": "3109ea58-8e14-4be9-cbae-b0d5d6427125"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.9702548101924077\n",
"Classification Report: \n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 1.00 0.98 17509\n",
" 1 0.96 0.69 0.81 1721\n",
"\n",
" accuracy 0.97 19230\n",
" macro avg 0.97 0.85 0.90 19230\n",
"weighted avg 0.97 0.97 0.97 19230\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Now lets try out hyperparameter tuning\n",
"\n",
"from sklearn.model_selection import GridSearchCV"
],
"metadata": {
"id": "R5g3kuruC_jQ"
},
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hyperparameter grid for the random forest (3 x 3 x 3 x 3 = 81 combinations).\n",
"param_grid_rf = {\n",
"    'n_estimators': [50, 100, 200],\n",
"    'max_depth': [5, 10, 15],\n",
"    'min_samples_split': [2, 5, 10],\n",
"    'min_samples_leaf': [1, 2, 4]\n",
"}\n",
"\n",
"# 5-fold cross-validated search scored on accuracy, using every available core.\n",
"grid_search_rf = GridSearchCV(\n",
"    RandomForestClassifier(random_state=42),\n",
"    param_grid_rf,\n",
"    cv=5,\n",
"    scoring='accuracy',\n",
"    n_jobs=-1,\n",
"    verbose=2,\n",
")\n",
"\n",
"# The search itself is left disabled; uncomment the lines below to fit and evaluate it.\n",
"# grid_search_rf.fit(X_train_df, y_train)\n",
"\n",
"# print(f\"Best Parameters for Random Forest: {grid_search_rf.best_params_}\")\n",
"\n",
"# estm = grid_search_rf.best_estimator_\n",
"# y_pred_rf = estm.predict(X_test_df)\n",
"\n",
"# print(f'Accuracy: {accuracy_score(y_test, y_pred_rf)}')\n",
"# print(f'Classification Report: \\n{classification_report(y_test, y_pred_rf)}')"
],
"metadata": {
"id": "6pt7VTU1IaUS"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"\n",
"# Check that the runtime exposes a CUDA GPU before the GPU-backed searches below.\n",
"cuda_ready = torch.cuda.is_available()\n",
"print(cuda_ready)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "A4bdvQLxLO7v",
"outputId": "dd8bd5f0-3fe5-4f12-c9d6-28c7b125801b"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"True\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# !pip uninstall -y scikit-learn xgboost\n",
"# !pip install scikit-learn xgboost\n",
"\n"
],
"metadata": {
"id": "-nyIpy9yNBQy"
},
"execution_count": 20,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Pin scikit-learn to 1.3.1. `%pip` (rather than `!pip`) installs into the\n",
"# environment of the running kernel. scikit-learn has already been imported\n",
"# by earlier cells, so restart the runtime after this cell for the pinned\n",
"# build to be the one actually in use.\n",
"%pip uninstall -y scikit-learn\n",
"%pip install scikit-learn==1.3.1\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 322
},
"id": "3GnHbZ1ANTrx",
"outputId": "6b355ac6-a103-42bc-9531-e69a306268e6"
},
"execution_count": 21,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Found existing installation: scikit-learn 1.3.1\n",
"Uninstalling scikit-learn-1.3.1:\n",
" Successfully uninstalled scikit-learn-1.3.1\n",
"Collecting scikit-learn==1.3.1\n",
" Using cached scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
"Requirement already satisfied: numpy<2.0,>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.3.1) (1.26.4)\n",
"Requirement already satisfied: scipy>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.3.1) (1.13.1)\n",
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.3.1) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.3.1) (3.5.0)\n",
"Using cached scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)\n",
"Installing collected packages: scikit-learn\n",
"Successfully installed scikit-learn-1.3.1\n"
]
},
{
"output_type": "display_data",
"data": {
"application/vnd.colab-display-data+json": {
"pip_warning": {
"packages": [
"sklearn"
]
},
"id": "da314930a9264314bfb31a015a04cbc0"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Randomized hyperparameter search for XGBoost\n",
"\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"import xgboost as xgb\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Candidate values the randomized search samples from.\n",
"param_dist_xgb = {\n",
"    'n_estimators': [50, 100, 200],\n",
"    'max_depth': [3, 6, 10],\n",
"    'learning_rate': [0.01, 0.1, 0.2],\n",
"    'subsample': [0.8, 1.0],\n",
"    'colsample_bytree': [0.8, 1.0]\n",
"}\n",
"\n",
"# GPU training: tree_method='gpu_hist' is deprecated since XGBoost 2.0; the\n",
"# supported form is tree_method='hist' together with device='cuda'.\n",
"# The target is binary, so the metric is 'logloss' rather than the\n",
"# multiclass 'mlogloss'.\n",
"random_search_xgb = RandomizedSearchCV(\n",
"    estimator=xgb.XGBClassifier(\n",
"        random_state=42,\n",
"        tree_method='hist',\n",
"        device='cuda',\n",
"        eval_metric='logloss',\n",
"    ),\n",
"    param_distributions=param_dist_xgb,\n",
"    scoring='accuracy',\n",
"    n_iter=10,  # number of random combinations to try\n",
"    cv=5,\n",
"    verbose=2,\n",
"    n_jobs=-1,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Fit the RandomizedSearchCV with training data\n",
"random_search_xgb.fit(X_train_df, y_train)\n",
"\n",
"# Print the best parameters found by RandomizedSearchCV\n",
"print(f\"Best Parameters for XGBoost: {random_search_xgb.best_params_}\")\n",
"\n",
"# Get the best estimator (model) from the random search\n",
"estm_xgb = random_search_xgb.best_estimator_\n",
"\n",
"# Make predictions with the best model\n",
"y_pred_xgb = estm_xgb.predict(X_test_df)\n",
"\n",
"# Print accuracy and classification report\n",
"print(f'Accuracy: {accuracy_score(y_test, y_pred_xgb)}')\n",
"print(f'Classification Report: \\n{classification_report(y_test, y_pred_xgb)}')\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sOfAnOxyI521",
"outputId": "d09c85d7-e2d3-4d71-c89f-d75202539db3"
},
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Fitting 5 folds for each of 10 candidates, totalling 50 fits\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:20:55] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
"\n",
" E.g. tree_method = \"hist\", device = \"cuda\"\n",
"\n",
" warnings.warn(smsg, UserWarning)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Best Parameters for XGBoost: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1.0}\n",
"Accuracy: 0.9709828393135725\n",
"Classification Report: \n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 1.00 0.98 17509\n",
" 1 0.99 0.69 0.81 1721\n",
"\n",
" accuracy 0.97 19230\n",
" macro avg 0.98 0.84 0.90 19230\n",
"weighted avg 0.97 0.97 0.97 19230\n",
"\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:20:56] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
"\n",
" E.g. tree_method = \"hist\", device = \"cuda\"\n",
"\n",
" warnings.warn(smsg, UserWarning)\n",
"/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:20:56] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n",
"Potential solutions:\n",
"- Use a data structure that matches the device ordinal in the booster.\n",
"- Set the device for booster before call to inplace_predict.\n",
"\n",
"This warning will only be shown once.\n",
"\n",
" warnings.warn(smsg, UserWarning)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Exhaustive grid over the XGBoost hyperparameters (3 x 3 x 3 x 2 x 2 = 108 combinations).\n",
"param_grid_xgb = {\n",
"    'n_estimators': [50, 100, 200],\n",
"    'max_depth': [3, 6, 10],\n",
"    'learning_rate': [0.01, 0.1, 0.2],\n",
"    'subsample': [0.8, 1.0],\n",
"    'colsample_bytree': [0.8, 1.0]\n",
"}\n",
"\n",
"# GPU training: tree_method='gpu_hist' is deprecated since XGBoost 2.0; the\n",
"# supported form is tree_method='hist' together with device='cuda'.\n",
"# The target is binary, so the metric is 'logloss' rather than the\n",
"# multiclass 'mlogloss'.\n",
"grid_search_xgb = GridSearchCV(\n",
"    estimator=xgb.XGBClassifier(\n",
"        random_state=42,\n",
"        tree_method='hist',\n",
"        device='cuda',\n",
"        eval_metric='logloss',\n",
"    ),\n",
"    param_grid=param_grid_xgb,\n",
"    scoring='accuracy',\n",
"    cv=5,\n",
"    verbose=2,\n",
"    n_jobs=-1,\n",
")\n",
"\n",
"grid_search_xgb.fit(X_train_df, y_train)\n",
"\n",
"print(f\"Best Parameters for XGBoost: {grid_search_xgb.best_params_}\")\n",
"\n",
"# Evaluate the best estimator found by the grid search on the held-out split.\n",
"estm_xg = grid_search_xgb.best_estimator_\n",
"y_pred_xgb = estm_xg.predict(X_test_df)\n",
"\n",
"print(f'Accuracy: {accuracy_score(y_test, y_pred_xgb)}')\n",
"print(f'Classification Report: \\n{classification_report(y_test, y_pred_xgb)}')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KN3sQOG_Mkyt",
"outputId": "2f04e0be-2a61-4e55-eb8c-978e1f53bf38"
},
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Fitting 5 folds for each of 108 candidates, totalling 540 fits\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:24:48] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
"\n",
" E.g. tree_method = \"hist\", device = \"cuda\"\n",
"\n",
" warnings.warn(smsg, UserWarning)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Best Parameters for XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}\n",
"Accuracy: 0.971086843473739\n",
"Classification Report: \n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 1.00 0.98 17509\n",
" 1 0.99 0.69 0.81 1721\n",
"\n",
" accuracy 0.97 19230\n",
" macro avg 0.98 0.84 0.90 19230\n",
"weighted avg 0.97 0.97 0.97 19230\n",
"\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:24:48] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
"\n",
" E.g. tree_method = \"hist\", device = \"cuda\"\n",
"\n",
" warnings.warn(smsg, UserWarning)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.metrics import confusion_matrix\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Confusion matrix of the latest XGBoost predictions against the test labels.\n",
"cm = confusion_matrix(y_test, y_pred_xgb)\n",
"\n",
"# Render it as an annotated heatmap on an explicit Axes object.\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)\n",
"ax.set_ylabel('True Label')\n",
"ax.set_xlabel('Predicted Label')\n",
"ax.set_title('Confusion Matrix')\n",
"plt.show()"
],
"metadata": {
"id": "ZCnpcc7qVGLx",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"outputId": "4b43f631-8251-4e5c-dd95-5b122b85f532"
},
"execution_count": 24,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Plot feature importance for the grid-searched XGBoost model.\n",
"# plot_importance is given the fitted estimator (estm_xg), not the array of\n",
"# predictions (y_pred_xgb).\n",
"xgb.plot_importance(estm_xg)\n",
"plt.show()\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"id": "wLkhIeV00Gb8",
"outputId": "53a79897-1c43-478c-d8bf-3631538bb2b0"
},
"execution_count": 28,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"import joblib"
],
"metadata": {
"id": "oLthUqWZ03hx"
},
"execution_count": 29,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Persist the grid-searched XGBoost estimator with joblib.\n",
"joblib.dump(estm_xg, 'xgb_model.pkl')\n",
"# To load the model later\n",
"# loaded_model = joblib.load('xgb_model.pkl')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uNgh-JRx1F7N",
"outputId": "2d656360-1689-41cf-8fa2-5cc570f13e34"
},
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['xgb_model.pkl']"
]
},
"metadata": {},
"execution_count": 30
}
]
},
{
"cell_type": "code",
"source": [
"import pickle\n",
"\n",
"# Serialise the same estimator with the standard-library pickle module,\n",
"# written to 'xgboost_model.pkl' in binary mode.\n",
"with open('xgboost_model.pkl', 'wb') as model_file:\n",
"    pickle.dump(estm_xg, model_file)\n"
],
"metadata": {
"id": "EUR23YUTIIIs"
},
"execution_count": 31,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pickle\n",
"\n",
"# Load the model from the file\n",
"# with open('xgboost_model.pkl', 'rb') as file:\n",
"# loaded_model = pickle.load(file)\n",
"\n",
"# You can now use the loaded_model for predictions\n",
"# y_pred = loaded_model.predict(X_test_df)"
],
"metadata": {
"id": "RFxEdU8zMhnp"
},
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "rU1-1ApYMyg2"
},
"execution_count": null,
"outputs": []
}
]
}