{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# 1. Load the Dataset\n",
"\n",
"The dataset you loaded has three classes of Iris species (setosa, versicolor, virginica) and four features (sepal length, sepal width, petal length, petal width). These features can predict the species."
],
"metadata": {
"id": "ZRclqpDjvjI5"
}
},
{
"cell_type": "code",
"source": [
"!pip install datasets"
],
"metadata": {
"collapsed": true,
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jy7afuhZv2cm",
"outputId": "cf268477-3114-4976-851d-08236709d46f"
},
"execution_count": 76,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
"Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
"Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
"Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
"Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n",
"Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n",
"Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"id": "g6I-DZGcvb-h"
},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"ds = load_dataset(\"scikit-learn/iris\")"
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"data = pd.DataFrame(ds['train'])\n",
"data.drop\n",
"data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 309
},
"id": "F_LhJ80b2lPl",
"outputId": "5edb0950-e70b-4a8b-b00a-f62ba09fda7b"
},
"execution_count": 78,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 1 5.1 3.5 1.4 0.2 Iris-setosa\n",
"1 2 4.9 3.0 1.4 0.2 Iris-setosa\n",
"2 3 4.7 3.2 1.3 0.2 Iris-setosa\n",
"3 4 4.6 3.1 1.5 0.2 Iris-setosa\n",
"4 5 5.0 3.6 1.4 0.2 Iris-setosa"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Id \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 5.1 \n",
" 3.5 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" 4.9 \n",
" 3.0 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 2 \n",
" 3 \n",
" 4.7 \n",
" 3.2 \n",
" 1.3 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 3 \n",
" 4 \n",
" 4.6 \n",
" 3.1 \n",
" 1.5 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 4 \n",
" 5 \n",
" 5.0 \n",
" 3.6 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "data",
"summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"Id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 43,\n \"min\": 1,\n \"max\": 150,\n \"num_unique_values\": 150,\n \"samples\": [\n 74,\n 19,\n 119\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 78
}
]
},
{
"cell_type": "markdown",
"source": [
"# 2. Preprocess the Data\n",
"We need to split the data into training and testing sets for evaluation. We’ll also normalize the data to improve model performance.\n"
],
"metadata": {
"id": "XOfQg2PEv8Y4"
}
},
{
"cell_type": "code",
"source": [
"data.info()"
],
"metadata": {
"id": "MYfcugmwv_Ip",
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"outputId": "92bbd9d4-ee64-4da6-84a8-74b385cabcb6"
},
"execution_count": 79,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"RangeIndex: 150 entries, 0 to 149\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 150 non-null int64 \n",
" 1 SepalLengthCm 150 non-null float64\n",
" 2 SepalWidthCm 150 non-null float64\n",
" 3 PetalLengthCm 150 non-null float64\n",
" 4 PetalWidthCm 150 non-null float64\n",
" 5 Species 150 non-null object \n",
"dtypes: float64(4), int64(1), object(1)\n",
"memory usage: 7.2+ KB\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"data.drop('Id', axis=1, inplace=True)"
],
"metadata": {
"id": "0fBxtkxa3Rf3"
},
"execution_count": 80,
"outputs": []
},
{
"cell_type": "code",
"source": [
"data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 309
},
"collapsed": true,
"id": "RHb_Ysqq3VpO",
"outputId": "45a9f1a9-7d8a-439a-dcf1-3c2733bc7437"
},
"execution_count": 81,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 5.1 3.5 1.4 0.2 Iris-setosa\n",
"1 4.9 3.0 1.4 0.2 Iris-setosa\n",
"2 4.7 3.2 1.3 0.2 Iris-setosa\n",
"3 4.6 3.1 1.5 0.2 Iris-setosa\n",
"4 5.0 3.6 1.4 0.2 Iris-setosa"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 5.1 \n",
" 3.5 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 1 \n",
" 4.9 \n",
" 3.0 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 2 \n",
" 4.7 \n",
" 3.2 \n",
" 1.3 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 3 \n",
" 4.6 \n",
" 3.1 \n",
" 1.5 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
" 4 \n",
" 5.0 \n",
" 3.6 \n",
" 1.4 \n",
" 0.2 \n",
" Iris-setosa \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "data",
"summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 81
}
]
},
{
"cell_type": "markdown",
"source": [
"refs: [Data science | Data Pre-processing using Scikit-learn| Iris dataset| Jay Patel@medium](https://jay190301.medium.com/data-science-data-pre-processing-using-scikit-learn-iris-dataset-1ba0a9ae04e6)"
],
"metadata": {
"id": "LJPk_k_3wnV4"
}
},
{
"cell_type": "markdown",
"source": [
"## Data Encoding\n",
"\n",
"1. label encoding"
],
"metadata": {
"id": "pECEH0dJw8bm"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"le = LabelEncoder()\n",
"data['Species'] = le.fit_transform(data['Species'])\n",
"data['Species'].value_counts()"
],
"metadata": {
"id": "UhgqOteCvhCJ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 244
},
"outputId": "6a8b0932-0d41-4a4b-e3f9-09ea5a7115fc"
},
"execution_count": 82,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Species\n",
"0 50\n",
"1 50\n",
"2 50\n",
"Name: count, dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" count \n",
" \n",
" \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 50 \n",
" \n",
" \n",
" 1 \n",
" 50 \n",
" \n",
" \n",
" 2 \n",
" 50 \n",
" \n",
" \n",
"
\n",
"
dtype: int64 "
]
},
"metadata": {},
"execution_count": 82
}
]
},
{
"cell_type": "code",
"source": [
"le.classes_"
],
"metadata": {
"id": "TUZBxAjRv--O",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3ce3dd07-5bb7-4f32-e75a-de81881b7a8a"
},
"execution_count": 83,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)"
]
},
"metadata": {},
"execution_count": 83
}
]
},
{
"cell_type": "markdown",
"source": [
"2. Onehot encoder"
],
"metadata": {
"id": "09l_XnFSxwfX"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"ohe = OneHotEncoder()\n",
"transformed_data = ohe.fit_transform(data['Species'].values.reshape(-1,1)).toarray()"
],
"metadata": {
"id": "MF2ssgzqxqYF"
},
"execution_count": 84,
"outputs": []
},
{
"cell_type": "code",
"source": [
"ohe.categories_"
],
"metadata": {
"id": "nQcrvyC6x8Ht",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b1244732-061f-4c86-acf2-5cd24a7a111e"
},
"execution_count": 85,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[array([0, 1, 2])]"
]
},
"metadata": {},
"execution_count": 85
}
]
},
{
"cell_type": "code",
"source": [
"transformed_data = pd.DataFrame(\n",
" transformed_data,\n",
" columns=['setosa', 'versicolor', 'virginica'],\n",
" index=data.index\n",
")\n",
"transformed_data.head()"
],
"metadata": {
"id": "JHaxnvPEx_x9",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"outputId": "06e1a530-04d3-44fa-e35c-0c2c4e07606a"
},
"execution_count": 86,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" setosa versicolor virginica\n",
"0 1.0 0.0 0.0\n",
"1 1.0 0.0 0.0\n",
"2 1.0 0.0 0.0\n",
"3 1.0 0.0 0.0\n",
"4 1.0 0.0 0.0"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" setosa \n",
" versicolor \n",
" virginica \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 1.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "transformed_data",
"summary": "{\n \"name\": \"transformed_data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"setosa\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"versicolor\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"virginica\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4729837698404015,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 86
}
]
},
{
"cell_type": "markdown",
"source": [
"## Normalization\n",
"\n",
"$$\n",
"x' = \\frac{x - \\text{min}(x)}{\\text{max}(x) - \\text{min}(x)}\n",
"$$"
],
"metadata": {
"id": "vEvDpT7ZycYx"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"mms = MinMaxScaler(feature_range=(0,1))\n",
"normalized_data = mms.fit_transform(data)\n",
"pd.DataFrame(\n",
" normalized_data,\n",
" columns=data.columns,\n",
")\n"
],
"metadata": {
"id": "Jbrfax_ryUqB",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 439
},
"outputId": "a399d0ce-d4c3-4ad3-fa6f-6b49e1d444a5"
},
"execution_count": 87,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 0.222222 0.625000 0.067797 0.041667 0.0\n",
"1 0.166667 0.416667 0.067797 0.041667 0.0\n",
"2 0.111111 0.500000 0.050847 0.041667 0.0\n",
"3 0.083333 0.458333 0.084746 0.041667 0.0\n",
"4 0.194444 0.666667 0.067797 0.041667 0.0\n",
".. ... ... ... ... ...\n",
"145 0.666667 0.416667 0.711864 0.916667 1.0\n",
"146 0.555556 0.208333 0.677966 0.750000 1.0\n",
"147 0.611111 0.416667 0.711864 0.791667 1.0\n",
"148 0.527778 0.583333 0.745763 0.916667 1.0\n",
"149 0.444444 0.416667 0.694915 0.708333 1.0\n",
"\n",
"[150 rows x 5 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.222222 \n",
" 0.625000 \n",
" 0.067797 \n",
" 0.041667 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 0.166667 \n",
" 0.416667 \n",
" 0.067797 \n",
" 0.041667 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 0.111111 \n",
" 0.500000 \n",
" 0.050847 \n",
" 0.041667 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.083333 \n",
" 0.458333 \n",
" 0.084746 \n",
" 0.041667 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 0.194444 \n",
" 0.666667 \n",
" 0.067797 \n",
" 0.041667 \n",
" 0.0 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 145 \n",
" 0.666667 \n",
" 0.416667 \n",
" 0.711864 \n",
" 0.916667 \n",
" 1.0 \n",
" \n",
" \n",
" 146 \n",
" 0.555556 \n",
" 0.208333 \n",
" 0.677966 \n",
" 0.750000 \n",
" 1.0 \n",
" \n",
" \n",
" 147 \n",
" 0.611111 \n",
" 0.416667 \n",
" 0.711864 \n",
" 0.791667 \n",
" 1.0 \n",
" \n",
" \n",
" 148 \n",
" 0.527778 \n",
" 0.583333 \n",
" 0.745763 \n",
" 0.916667 \n",
" 1.0 \n",
" \n",
" \n",
" 149 \n",
" 0.444444 \n",
" 0.416667 \n",
" 0.694915 \n",
" 0.708333 \n",
" 1.0 \n",
" \n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.23001836888273966,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.5277777777777779,\n 0.05555555555555558,\n 0.36111111111111094\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.18066429640090576,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 23,\n \"samples\": [\n 0.12499999999999989,\n 0.8333333333333333,\n 0.625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29905430846648523,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 43,\n \"samples\": [\n 0.9661016949152543,\n 0.47457627118644063,\n 0.4576271186440678\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3179836423753504,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.04166666666666667,\n 0.4583333333333333,\n 0.5000000000000001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40961596025952024,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 0.5,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 87
}
]
},
{
"cell_type": "markdown",
"source": [
"## Standardization\n",
"\n",
"$$\n",
"z = \\frac{x - \\mu}{\\sigma}\n",
"$$"
],
"metadata": {
"id": "lSgmybXl196Y"
}
},
{
"cell_type": "code",
"source": [
"standard_scaler = StandardScaler()\n",
"standardized_data = standard_scaler.fit_transform(data)\n",
"pd.DataFrame(\n",
" standardized_data,\n",
" columns=data.columns,\n",
")"
],
"metadata": {
"id": "JeftVnOz0cAR",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 439
},
"outputId": "8a3d1c62-89ca-4d01-90e2-c7b0292c2855"
},
"execution_count": 88,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 -0.900681 1.032057 -1.341272 -1.312977 -1.224745\n",
"1 -1.143017 -0.124958 -1.341272 -1.312977 -1.224745\n",
"2 -1.385353 0.337848 -1.398138 -1.312977 -1.224745\n",
"3 -1.506521 0.106445 -1.284407 -1.312977 -1.224745\n",
"4 -1.021849 1.263460 -1.341272 -1.312977 -1.224745\n",
".. ... ... ... ... ...\n",
"145 1.038005 -0.124958 0.819624 1.447956 1.224745\n",
"146 0.553333 -1.281972 0.705893 0.922064 1.224745\n",
"147 0.795669 -0.124958 0.819624 1.053537 1.224745\n",
"148 0.432165 0.800654 0.933356 1.447956 1.224745\n",
"149 0.068662 -0.124958 0.762759 0.790591 1.224745\n",
"\n",
"[150 rows x 5 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" -0.900681 \n",
" 1.032057 \n",
" -1.341272 \n",
" -1.312977 \n",
" -1.224745 \n",
" \n",
" \n",
" 1 \n",
" -1.143017 \n",
" -0.124958 \n",
" -1.341272 \n",
" -1.312977 \n",
" -1.224745 \n",
" \n",
" \n",
" 2 \n",
" -1.385353 \n",
" 0.337848 \n",
" -1.398138 \n",
" -1.312977 \n",
" -1.224745 \n",
" \n",
" \n",
" 3 \n",
" -1.506521 \n",
" 0.106445 \n",
" -1.284407 \n",
" -1.312977 \n",
" -1.224745 \n",
" \n",
" \n",
" 4 \n",
" -1.021849 \n",
" 1.263460 \n",
" -1.341272 \n",
" -1.312977 \n",
" -1.224745 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 145 \n",
" 1.038005 \n",
" -0.124958 \n",
" 0.819624 \n",
" 1.447956 \n",
" 1.224745 \n",
" \n",
" \n",
" 146 \n",
" 0.553333 \n",
" -1.281972 \n",
" 0.705893 \n",
" 0.922064 \n",
" 1.224745 \n",
" \n",
" \n",
" 147 \n",
" 0.795669 \n",
" -0.124958 \n",
" 0.819624 \n",
" 1.053537 \n",
" 1.224745 \n",
" \n",
" \n",
" 148 \n",
" 0.432165 \n",
" 0.800654 \n",
" 0.933356 \n",
" 1.447956 \n",
" 1.224745 \n",
" \n",
" \n",
" 149 \n",
" 0.068662 \n",
" -0.124958 \n",
" 0.762759 \n",
" 0.790591 \n",
" 1.224745 \n",
" \n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.870024133847019,\n \"max\": 2.4920192021244283,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.432165404582356,\n -1.6276883929597161,\n -0.29484181807955234\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -2.438987252491841,\n \"max\": 3.1146839106774356,\n \"num_unique_values\": 23,\n \"samples\": [\n -1.7447783570956819,\n 2.1890720501492225,\n 1.0320572244889565\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359765,\n \"min\": -1.5687352207168408,\n \"max\": 1.7863413146490472,\n \"num_unique_values\": 43,\n \"samples\": [\n 1.6726099066705424,\n 0.02350449098222449,\n -0.03336121300702764\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.4444496972795189,\n \"max\": 1.7109015831854495,\n \"num_unique_values\": 22,\n \"samples\": [\n -1.3129767272601454,\n 0.001752972933591456,\n 0.13322594295296525\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.224744871391589,\n \"max\": 1.224744871391589,\n \"num_unique_values\": 3,\n \"samples\": [\n -1.224744871391589,\n 0.0,\n 1.224744871391589\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 88
}
]
},
{
"cell_type": "markdown",
"source": [
"## Imputation of missing values\n"
],
"metadata": {
"id": "jhRacCGx4hHD"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.impute import SimpleImputer\n",
"import numpy as np\n",
"\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
"imputed_data = imputer.fit_transform(data)\n",
"pd.DataFrame(\n",
" imputed_data,\n",
").isnull().sum()"
],
"metadata": {
"id": "3qqVM1ng2PsJ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 275
},
"outputId": "af35c600-d995-4458-a5e6-a88e359526dd"
},
"execution_count": 89,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
dtype: int64 "
]
},
"metadata": {},
"execution_count": 89
}
]
},
{
"cell_type": "markdown",
"source": [
"## Discretization"
],
"metadata": {
"id": "i76NX2Ev463h"
}
},
{
"cell_type": "markdown",
"source": [
"1. Quantile Discretization Transform"
],
"metadata": {
"id": "VMYOkM_L48kl"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import KBinsDiscretizer\n",
"\n",
"trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')\n",
"new_data = trans.fit_transform(data)\n",
"pd.DataFrame(\n",
" new_data,\n",
" columns=data.columns\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 541
},
"id": "gOVi2uYk4qcu",
"outputId": "7b51e4c9-9c89-42ac-bb78-465a7032d4bf"
},
"execution_count": 90,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 1 are removed. Consider decreasing the number of bins.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 3 are removed. Consider decreasing the number of bins.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n",
" warnings.warn(\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 2.0 7.0 1.0 1.0 0.0\n",
"1 1.0 4.0 1.0 1.0 0.0\n",
"2 0.0 6.0 0.0 1.0 0.0\n",
"3 0.0 5.0 2.0 1.0 0.0\n",
"4 2.0 7.0 1.0 1.0 0.0\n",
".. ... ... ... ... ...\n",
"145 8.0 4.0 7.0 8.0 1.0\n",
"146 7.0 1.0 7.0 7.0 1.0\n",
"147 7.0 4.0 7.0 7.0 1.0\n",
"148 6.0 7.0 8.0 8.0 1.0\n",
"149 5.0 4.0 7.0 6.0 1.0\n",
"\n",
"[150 rows x 5 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2.0 \n",
" 7.0 \n",
" 1.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 4.0 \n",
" 1.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 0.0 \n",
" 6.0 \n",
" 0.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.0 \n",
" 5.0 \n",
" 2.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 2.0 \n",
" 7.0 \n",
" 1.0 \n",
" 1.0 \n",
" 0.0 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 145 \n",
" 8.0 \n",
" 4.0 \n",
" 7.0 \n",
" 8.0 \n",
" 1.0 \n",
" \n",
" \n",
" 146 \n",
" 7.0 \n",
" 1.0 \n",
" 7.0 \n",
" 7.0 \n",
" 1.0 \n",
" \n",
" \n",
" 147 \n",
" 7.0 \n",
" 4.0 \n",
" 7.0 \n",
" 7.0 \n",
" 1.0 \n",
" \n",
" \n",
" 148 \n",
" 6.0 \n",
" 7.0 \n",
" 8.0 \n",
" 8.0 \n",
" 1.0 \n",
" \n",
" \n",
" 149 \n",
" 5.0 \n",
" 4.0 \n",
" 7.0 \n",
" 6.0 \n",
" 1.0 \n",
" \n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.830395898032167,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3949611533748287,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 2.0,\n 4.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.7971621924901315,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 0.0,\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5674698763560073,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 8.0,\n 2.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 90
}
]
},
{
"cell_type": "markdown",
"source": [
"2. Uniform Discretization Transform"
],
"metadata": {
"id": "vtHUC3pT5OTS"
}
},
{
"cell_type": "code",
"source": [
"trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')\n",
"new_data = trans.fit_transform(data)\n",
"pd.DataFrame(\n",
" new_data,\n",
" columns=data.columns\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 439
},
"id": "XZaM1ewV5HjE",
"outputId": "874d7838-627a-4705-dc2f-084dff68bdd5"
},
"execution_count": 91,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 2.0 6.0 0.0 0.0 0.0\n",
"1 1.0 4.0 0.0 0.0 0.0\n",
"2 1.0 5.0 0.0 0.0 0.0\n",
"3 0.0 4.0 0.0 0.0 0.0\n",
"4 1.0 6.0 0.0 0.0 0.0\n",
".. ... ... ... ... ...\n",
"145 6.0 4.0 7.0 9.0 9.0\n",
"146 5.0 2.0 6.0 7.0 9.0\n",
"147 6.0 4.0 7.0 7.0 9.0\n",
"148 5.0 5.0 7.0 9.0 9.0\n",
"149 4.0 4.0 6.0 7.0 9.0\n",
"\n",
"[150 rows x 5 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2.0 \n",
" 6.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 4.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" 5.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.0 \n",
" 4.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 1.0 \n",
" 6.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 145 \n",
" 6.0 \n",
" 4.0 \n",
" 7.0 \n",
" 9.0 \n",
" 9.0 \n",
" \n",
" \n",
" 146 \n",
" 5.0 \n",
" 2.0 \n",
" 6.0 \n",
" 7.0 \n",
" 9.0 \n",
" \n",
" \n",
" 147 \n",
" 6.0 \n",
" 4.0 \n",
" 7.0 \n",
" 7.0 \n",
" 9.0 \n",
" \n",
" \n",
" 148 \n",
" 5.0 \n",
" 5.0 \n",
" 7.0 \n",
" 9.0 \n",
" 9.0 \n",
" \n",
" \n",
" 149 \n",
" 4.0 \n",
" 4.0 \n",
" 6.0 \n",
" 7.0 \n",
" 9.0 \n",
" \n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3315749206787793,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8075778842435182,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.9708577087647687,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 7.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1659613014009933,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.6941213351051103,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 5.0,\n 9.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 91
}
]
},
{
"cell_type": "markdown",
"source": [
"3. KMeans Discretization Transform"
],
"metadata": {
"id": "uMFedD8J5V0U"
}
},
{
"cell_type": "code",
"source": [
"trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')\n",
"new_data = trans.fit_transform(data)\n",
"pd.DataFrame(\n",
" new_data,\n",
" columns=data.columns\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 507
},
"id": "q4KFEczX5SYj",
"outputId": "1a85ca09-5dc4-4dfc-9ccb-4aca2776bc21"
},
"execution_count": 92,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/base.py:1473: ConvergenceWarning: Number of distinct clusters (3) found smaller than n_clusters (10). Possibly due to duplicate points in X.\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n",
" warnings.warn(\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 2.0 6.0 0.0 0.0 0.0\n",
"1 1.0 4.0 0.0 0.0 0.0\n",
"2 1.0 4.0 0.0 0.0 0.0\n",
"3 0.0 4.0 0.0 0.0 0.0\n",
"4 1.0 6.0 0.0 0.0 0.0\n",
".. ... ... ... ... ...\n",
"145 6.0 4.0 5.0 9.0 2.0\n",
"146 5.0 2.0 5.0 7.0 2.0\n",
"147 6.0 4.0 5.0 7.0 2.0\n",
"148 5.0 5.0 6.0 9.0 2.0\n",
"149 4.0 4.0 5.0 7.0 2.0\n",
"\n",
"[150 rows x 5 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" SepalLengthCm \n",
" SepalWidthCm \n",
" PetalLengthCm \n",
" PetalWidthCm \n",
" Species \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2.0 \n",
" 6.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 4.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" 4.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.0 \n",
" 4.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 1.0 \n",
" 6.0 \n",
" 0.0 \n",
" 0.0 \n",
" 0.0 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 145 \n",
" 6.0 \n",
" 4.0 \n",
" 5.0 \n",
" 9.0 \n",
" 2.0 \n",
" \n",
" \n",
" 146 \n",
" 5.0 \n",
" 2.0 \n",
" 5.0 \n",
" 7.0 \n",
" 2.0 \n",
" \n",
" \n",
" 147 \n",
" 6.0 \n",
" 4.0 \n",
" 5.0 \n",
" 7.0 \n",
" 2.0 \n",
" \n",
" \n",
" 148 \n",
" 5.0 \n",
" 5.0 \n",
" 6.0 \n",
" 9.0 \n",
" 2.0 \n",
" \n",
" \n",
" 149 \n",
" 4.0 \n",
" 4.0 \n",
" 5.0 \n",
" 7.0 \n",
" 2.0 \n",
" \n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3191065162163307,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7795256135824453,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.551842778463346,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1587455386713343,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8192319205190405,\n \"min\": 0.0,\n \"max\": 2.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 92
}
]
},
{
"cell_type": "markdown",
"source": [
"## In this dataset we use Standardization\n"
],
"metadata": {
"id": "q8Zl6Cp65pwQ"
}
},
{
"cell_type": "code",
"source": [
"# Extract features and labels\n",
"\n",
"X = data.drop('Species', axis=1)\n",
"y = data['Species']"
],
"metadata": {
"id": "4Y46_FZP5uQ_"
},
"execution_count": 93,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"id": "fSEBRAIx6lMx"
},
"execution_count": 94,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sc = StandardScaler()\n",
"X_train = sc.fit_transform(X_train)\n",
"X_test = sc.transform(X_test)"
],
"metadata": {
"id": "cV2TNfaT6m-G"
},
"execution_count": 95,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 3. Choose a Model\n",
"\n",
"https://en.wikipedia.org/wiki/Logistic_regression"
],
"metadata": {
"id": "prBHDhku7R2I"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"model = LogisticRegression()\n",
"model.fit(X_train, y_train)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 80
},
"id": "-6h32lvi62pc",
"outputId": "c62a7a44-2f13-4f9f-e305-cb63918af714"
},
"execution_count": 96,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"LogisticRegression()"
],
"text/html": [
"LogisticRegression() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
]
},
"metadata": {},
"execution_count": 96
}
]
},
{
"cell_type": "markdown",
"source": [
"# 4. Train the Model\n"
],
"metadata": {
"id": "VTIpDLia9KYN"
}
},
{
"cell_type": "code",
"source": [
"y_pred = model.predict(X_test)"
],
"metadata": {
"id": "SBhwo4cZ85tF"
},
"execution_count": 97,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 5. Evaluate the Model\n"
],
"metadata": {
"id": "7PghPlKQ9OTQ"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(f\"Accuracy: {accuracy:.2f}\")\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9N89xiJG9FOx",
"outputId": "aea846a4-aad0-440f-f37d-50807fe3fd57"
},
"execution_count": 98,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 1.00\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 10\n",
" 1 1.00 1.00 1.00 9\n",
" 2 1.00 1.00 1.00 11\n",
"\n",
" accuracy 1.00 30\n",
" macro avg 1.00 1.00 1.00 30\n",
"weighted avg 1.00 1.00 1.00 30\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import joblib\n",
"\n",
"joblib.dump(model, 'iris_logistic_regression_model.pkl')\n",
"print(\"Model saved to iris_logistic_regression_model.pkl\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LbRKB89f9PeA",
"outputId": "dd884167-32aa-444b-984b-6c2302e6c80f"
},
"execution_count": 99,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Model saved to iris_logistic_regression_model.pkl\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"loaded_model = joblib.load('logistic_regression_model.pkl')\n",
"print(\"Model loaded successfully\")\n",
"\n",
"\n",
"new_predictions = loaded_model.predict(X_test)\n",
"print(new_predictions)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HxP0rhZa9g-X",
"outputId": "2df09e3e-98a3-4d90-fd48-6007dfe2838f"
},
"execution_count": 100,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Model loaded successfully\n",
"[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"accuracy = accuracy_score(y_test, new_predictions)\n",
"print(f\"Accuracy: {accuracy:.2f}\")\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, new_predictions))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xWcxawsV9jA5",
"outputId": "f22a4df2-1014-4c94-d802-ce1f2d9ea836"
},
"execution_count": 101,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 1.00\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 10\n",
" 1 1.00 1.00 1.00 9\n",
" 2 1.00 1.00 1.00 11\n",
"\n",
" accuracy 1.00 30\n",
" macro avg 1.00 1.00 1.00 30\n",
"weighted avg 1.00 1.00 1.00 30\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "etijgWI29oUs"
},
"execution_count": 101,
"outputs": []
}
]
}