diff --git "a/training.ipynb" "b/training.ipynb" new file mode 100644--- /dev/null +++ "b/training.ipynb" @@ -0,0 +1,1353 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "# \n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import OrdinalEncoder\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.preprocessing import QuantileTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "url='https://raw.githubusercontent.com/digipodium/Datasets/main/regression/diamonds.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | carat | \n", + "cut | \n", + "color | \n", + "clarity | \n", + "depth | \n", + "table | \n", + "price | \n", + "x | \n", + "y | \n", + "z | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "0.23 | \n", + "Ideal | \n", + "E | \n", + "SI2 | \n", + "61.5 | \n", + "55.0 | \n", + "326 | \n", + "3.95 | \n", + "3.98 | \n", + "2.43 | \n", + "
2 | \n", + "0.21 | \n", + "Premium | \n", + "E | \n", + "SI1 | \n", + "59.8 | \n", + "61.0 | \n", + "326 | \n", + "3.89 | \n", + "3.84 | \n", + "2.31 | \n", + "
3 | \n", + "0.23 | \n", + "Good | \n", + "E | \n", + "VS1 | \n", + "56.9 | \n", + "65.0 | \n", + "327 | \n", + "4.05 | \n", + "4.07 | \n", + "2.31 | \n", + "
4 | \n", + "0.29 | \n", + "Premium | \n", + "I | \n", + "VS2 | \n", + "62.4 | \n", + "58.0 | \n", + "334 | \n", + "4.20 | \n", + "4.23 | \n", + "2.63 | \n", + "
5 | \n", + "0.31 | \n", + "Good | \n", + "J | \n", + "SI2 | \n", + "63.3 | \n", + "58.0 | \n", + "335 | \n", + "4.34 | \n", + "4.35 | \n", + "2.75 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
53936 | \n", + "0.72 | \n", + "Ideal | \n", + "D | \n", + "SI1 | \n", + "60.8 | \n", + "57.0 | \n", + "2757 | \n", + "5.75 | \n", + "5.76 | \n", + "3.50 | \n", + "
53937 | \n", + "0.72 | \n", + "Good | \n", + "D | \n", + "SI1 | \n", + "63.1 | \n", + "55.0 | \n", + "2757 | \n", + "5.69 | \n", + "5.75 | \n", + "3.61 | \n", + "
53938 | \n", + "0.70 | \n", + "Very Good | \n", + "D | \n", + "SI1 | \n", + "62.8 | \n", + "60.0 | \n", + "2757 | \n", + "5.66 | \n", + "5.68 | \n", + "3.56 | \n", + "
53939 | \n", + "0.86 | \n", + "Premium | \n", + "H | \n", + "SI2 | \n", + "61.0 | \n", + "58.0 | \n", + "2757 | \n", + "6.15 | \n", + "6.12 | \n", + "3.74 | \n", + "
53940 | \n", + "0.75 | \n", + "Ideal | \n", + "D | \n", + "SI2 | \n", + "62.2 | \n", + "55.0 | \n", + "2757 | \n", + "5.83 | \n", + "5.87 | \n", + "3.64 | \n", + "
53940 rows × 10 columns
\n", + "\n", + " | carat | \n", + "depth | \n", + "table | \n", + "price | \n", + "x | \n", + "y | \n", + "z | \n", + "
---|---|---|---|---|---|---|---|
count | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "
mean | \n", + "0.797940 | \n", + "61.749405 | \n", + "57.457184 | \n", + "3932.799722 | \n", + "5.731157 | \n", + "5.734526 | \n", + "3.538734 | \n", + "
std | \n", + "0.474011 | \n", + "1.432621 | \n", + "2.234491 | \n", + "3989.439738 | \n", + "1.121761 | \n", + "1.142135 | \n", + "0.705699 | \n", + "
min | \n", + "0.200000 | \n", + "43.000000 | \n", + "43.000000 | \n", + "326.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
25% | \n", + "0.400000 | \n", + "61.000000 | \n", + "56.000000 | \n", + "950.000000 | \n", + "4.710000 | \n", + "4.720000 | \n", + "2.910000 | \n", + "
50% | \n", + "0.700000 | \n", + "61.800000 | \n", + "57.000000 | \n", + "2401.000000 | \n", + "5.700000 | \n", + "5.710000 | \n", + "3.530000 | \n", + "
75% | \n", + "1.040000 | \n", + "62.500000 | \n", + "59.000000 | \n", + "5324.250000 | \n", + "6.540000 | \n", + "6.540000 | \n", + "4.040000 | \n", + "
max | \n", + "5.010000 | \n", + "79.000000 | \n", + "95.000000 | \n", + "18823.000000 | \n", + "10.740000 | \n", + "58.900000 | \n", + "31.800000 | \n", + "
ColumnTransformer(transformers=[('categorical',\n", + " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", + " Index(['cut', 'color', 'clarity'], dtype='object')),\n", + " ('numerical',\n", + " Pipeline(steps=[('sc', StandardScaler())]),\n", + " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('categorical',\n", + " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", + " Index(['cut', 'color', 'clarity'], dtype='object')),\n", + " ('numerical',\n", + " Pipeline(steps=[('sc', StandardScaler())]),\n", + " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])
Index(['cut', 'color', 'clarity'], dtype='object')
OrdinalEncoder()
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
StandardScaler()
Pipeline(steps=[('transformer',\n", + " ColumnTransformer(transformers=[('categorical',\n", + " Pipeline(steps=[('oe',\n", + " OrdinalEncoder())]),\n", + " Index(['cut', 'color', 'clarity'], dtype='object')),\n", + " ('numerical',\n", + " Pipeline(steps=[('sc',\n", + " StandardScaler())]),\n", + " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])),\n", + " ('model', RandomForestRegressor(max_depth=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('transformer',\n", + " ColumnTransformer(transformers=[('categorical',\n", + " Pipeline(steps=[('oe',\n", + " OrdinalEncoder())]),\n", + " Index(['cut', 'color', 'clarity'], dtype='object')),\n", + " ('numerical',\n", + " Pipeline(steps=[('sc',\n", + " StandardScaler())]),\n", + " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])),\n", + " ('model', RandomForestRegressor(max_depth=10))])
ColumnTransformer(transformers=[('categorical',\n", + " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", + " Index(['cut', 'color', 'clarity'], dtype='object')),\n", + " ('numerical',\n", + " Pipeline(steps=[('sc', StandardScaler())]),\n", + " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])
Index(['cut', 'color', 'clarity'], dtype='object')
OrdinalEncoder()
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
StandardScaler()
RandomForestRegressor(max_depth=10)
\n", + " | carat | \n", + "cut | \n", + "color | \n", + "clarity | \n", + "depth | \n", + "table | \n", + "x | \n", + "y | \n", + "z | \n", + "
---|---|---|---|---|---|---|---|---|---|
1 | \n", + "0.23 | \n", + "Ideal | \n", + "E | \n", + "SI2 | \n", + "61.5 | \n", + "55.0 | \n", + "3.95 | \n", + "3.98 | \n", + "2.43 | \n", + "
2 | \n", + "0.21 | \n", + "Premium | \n", + "E | \n", + "SI1 | \n", + "59.8 | \n", + "61.0 | \n", + "3.89 | \n", + "3.84 | \n", + "2.31 | \n", + "
3 | \n", + "0.23 | \n", + "Good | \n", + "E | \n", + "VS1 | \n", + "56.9 | \n", + "65.0 | \n", + "4.05 | \n", + "4.07 | \n", + "2.31 | \n", + "
4 | \n", + "0.29 | \n", + "Premium | \n", + "I | \n", + "VS2 | \n", + "62.4 | \n", + "58.0 | \n", + "4.20 | \n", + "4.23 | \n", + "2.63 | \n", + "
5 | \n", + "0.31 | \n", + "Good | \n", + "J | \n", + "SI2 | \n", + "63.3 | \n", + "58.0 | \n", + "4.34 | \n", + "4.35 | \n", + "2.75 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
53936 | \n", + "0.72 | \n", + "Ideal | \n", + "D | \n", + "SI1 | \n", + "60.8 | \n", + "57.0 | \n", + "5.75 | \n", + "5.76 | \n", + "3.50 | \n", + "
53937 | \n", + "0.72 | \n", + "Good | \n", + "D | \n", + "SI1 | \n", + "63.1 | \n", + "55.0 | \n", + "5.69 | \n", + "5.75 | \n", + "3.61 | \n", + "
53938 | \n", + "0.70 | \n", + "Very Good | \n", + "D | \n", + "SI1 | \n", + "62.8 | \n", + "60.0 | \n", + "5.66 | \n", + "5.68 | \n", + "3.56 | \n", + "
53939 | \n", + "0.86 | \n", + "Premium | \n", + "H | \n", + "SI2 | \n", + "61.0 | \n", + "58.0 | \n", + "6.15 | \n", + "6.12 | \n", + "3.74 | \n", + "
53940 | \n", + "0.75 | \n", + "Ideal | \n", + "D | \n", + "SI2 | \n", + "62.2 | \n", + "55.0 | \n", + "5.83 | \n", + "5.87 | \n", + "3.64 | \n", + "
53940 rows × 9 columns
\n", + "\n", + " | carat | \n", + "depth | \n", + "table | \n", + "x | \n", + "y | \n", + "z | \n", + "
---|---|---|---|---|---|---|
count | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "53940.000000 | \n", + "
mean | \n", + "0.797940 | \n", + "61.749405 | \n", + "57.457184 | \n", + "5.731157 | \n", + "5.734526 | \n", + "3.538734 | \n", + "
std | \n", + "0.474011 | \n", + "1.432621 | \n", + "2.234491 | \n", + "1.121761 | \n", + "1.142135 | \n", + "0.705699 | \n", + "
min | \n", + "0.200000 | \n", + "43.000000 | \n", + "43.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
25% | \n", + "0.400000 | \n", + "61.000000 | \n", + "56.000000 | \n", + "4.710000 | \n", + "4.720000 | \n", + "2.910000 | \n", + "
50% | \n", + "0.700000 | \n", + "61.800000 | \n", + "57.000000 | \n", + "5.700000 | \n", + "5.710000 | \n", + "3.530000 | \n", + "
75% | \n", + "1.040000 | \n", + "62.500000 | \n", + "59.000000 | \n", + "6.540000 | \n", + "6.540000 | \n", + "4.040000 | \n", + "
max | \n", + "5.010000 | \n", + "79.000000 | \n", + "95.000000 | \n", + "10.740000 | \n", + "58.900000 | \n", + "31.800000 | \n", + "
\n", + " | cut | \n", + "color | \n", + "clarity | \n", + "
---|---|---|---|
count | \n", + "53940 | \n", + "53940 | \n", + "53940 | \n", + "
unique | \n", + "5 | \n", + "7 | \n", + "8 | \n", + "
top | \n", + "Ideal | \n", + "G | \n", + "SI1 | \n", + "
freq | \n", + "21551 | \n", + "11292 | \n", + "13065 | \n", + "