{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import seaborn as sns\n", "# scikit-learn: splitting/search, metrics, pipeline, preprocessing, model\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.preprocessing import QuantileTransformer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "url='https://raw.githubusercontent.com/digipodium/Datasets/main/regression/diamonds.csv'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | carat | \n", "cut | \n", "color | \n", "clarity | \n", "depth | \n", "table | \n", "price | \n", "x | \n", "y | \n", "z | \n", "
---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "0.23 | \n", "Ideal | \n", "E | \n", "SI2 | \n", "61.5 | \n", "55.0 | \n", "326 | \n", "3.95 | \n", "3.98 | \n", "2.43 | \n", "
2 | \n", "0.21 | \n", "Premium | \n", "E | \n", "SI1 | \n", "59.8 | \n", "61.0 | \n", "326 | \n", "3.89 | \n", "3.84 | \n", "2.31 | \n", "
3 | \n", "0.23 | \n", "Good | \n", "E | \n", "VS1 | \n", "56.9 | \n", "65.0 | \n", "327 | \n", "4.05 | \n", "4.07 | \n", "2.31 | \n", "
4 | \n", "0.29 | \n", "Premium | \n", "I | \n", "VS2 | \n", "62.4 | \n", "58.0 | \n", "334 | \n", "4.20 | \n", "4.23 | \n", "2.63 | \n", "
5 | \n", "0.31 | \n", "Good | \n", "J | \n", "SI2 | \n", "63.3 | \n", "58.0 | \n", "335 | \n", "4.34 | \n", "4.35 | \n", "2.75 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
53936 | \n", "0.72 | \n", "Ideal | \n", "D | \n", "SI1 | \n", "60.8 | \n", "57.0 | \n", "2757 | \n", "5.75 | \n", "5.76 | \n", "3.50 | \n", "
53937 | \n", "0.72 | \n", "Good | \n", "D | \n", "SI1 | \n", "63.1 | \n", "55.0 | \n", "2757 | \n", "5.69 | \n", "5.75 | \n", "3.61 | \n", "
53938 | \n", "0.70 | \n", "Very Good | \n", "D | \n", "SI1 | \n", "62.8 | \n", "60.0 | \n", "2757 | \n", "5.66 | \n", "5.68 | \n", "3.56 | \n", "
53939 | \n", "0.86 | \n", "Premium | \n", "H | \n", "SI2 | \n", "61.0 | \n", "58.0 | \n", "2757 | \n", "6.15 | \n", "6.12 | \n", "3.74 | \n", "
53940 | \n", "0.75 | \n", "Ideal | \n", "D | \n", "SI2 | \n", "62.2 | \n", "55.0 | \n", "2757 | \n", "5.83 | \n", "5.87 | \n", "3.64 | \n", "
53940 rows × 10 columns
\n", "\n", " | carat | \n", "depth | \n", "table | \n", "price | \n", "x | \n", "y | \n", "z | \n", "
---|---|---|---|---|---|---|---|
count | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "
mean | \n", "0.797940 | \n", "61.749405 | \n", "57.457184 | \n", "3932.799722 | \n", "5.731157 | \n", "5.734526 | \n", "3.538734 | \n", "
std | \n", "0.474011 | \n", "1.432621 | \n", "2.234491 | \n", "3989.439738 | \n", "1.121761 | \n", "1.142135 | \n", "0.705699 | \n", "
min | \n", "0.200000 | \n", "43.000000 | \n", "43.000000 | \n", "326.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
25% | \n", "0.400000 | \n", "61.000000 | \n", "56.000000 | \n", "950.000000 | \n", "4.710000 | \n", "4.720000 | \n", "2.910000 | \n", "
50% | \n", "0.700000 | \n", "61.800000 | \n", "57.000000 | \n", "2401.000000 | \n", "5.700000 | \n", "5.710000 | \n", "3.530000 | \n", "
75% | \n", "1.040000 | \n", "62.500000 | \n", "59.000000 | \n", "5324.250000 | \n", "6.540000 | \n", "6.540000 | \n", "4.040000 | \n", "
max | \n", "5.010000 | \n", "79.000000 | \n", "95.000000 | \n", "18823.000000 | \n", "10.740000 | \n", "58.900000 | \n", "31.800000 | \n", "
ColumnTransformer(transformers=[('categorical',\n", " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", " Index(['cut', 'color', 'clarity'], dtype='object')),\n", " ('numerical',\n", " Pipeline(steps=[('sc', StandardScaler())]),\n", " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('categorical',\n", " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", " Index(['cut', 'color', 'clarity'], dtype='object')),\n", " ('numerical',\n", " Pipeline(steps=[('sc', StandardScaler())]),\n", " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])
Index(['cut', 'color', 'clarity'], dtype='object')
OrdinalEncoder()
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
StandardScaler()
Pipeline(steps=[('transformer',\n", " ColumnTransformer(transformers=[('categorical',\n", " Pipeline(steps=[('oe',\n", " OrdinalEncoder())]),\n", " Index(['cut', 'color', 'clarity'], dtype='object')),\n", " ('numerical',\n", " Pipeline(steps=[('sc',\n", " StandardScaler())]),\n", " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])),\n", " ('model', RandomForestRegressor(max_depth=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('transformer',\n", " ColumnTransformer(transformers=[('categorical',\n", " Pipeline(steps=[('oe',\n", " OrdinalEncoder())]),\n", " Index(['cut', 'color', 'clarity'], dtype='object')),\n", " ('numerical',\n", " Pipeline(steps=[('sc',\n", " StandardScaler())]),\n", " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])),\n", " ('model', RandomForestRegressor(max_depth=10))])
ColumnTransformer(transformers=[('categorical',\n", " Pipeline(steps=[('oe', OrdinalEncoder())]),\n", " Index(['cut', 'color', 'clarity'], dtype='object')),\n", " ('numerical',\n", " Pipeline(steps=[('sc', StandardScaler())]),\n", " Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])
Index(['cut', 'color', 'clarity'], dtype='object')
OrdinalEncoder()
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
StandardScaler()
RandomForestRegressor(max_depth=10)
\n", " | carat | \n", "cut | \n", "color | \n", "clarity | \n", "depth | \n", "table | \n", "x | \n", "y | \n", "z | \n", "
---|---|---|---|---|---|---|---|---|---|
1 | \n", "0.23 | \n", "Ideal | \n", "E | \n", "SI2 | \n", "61.5 | \n", "55.0 | \n", "3.95 | \n", "3.98 | \n", "2.43 | \n", "
2 | \n", "0.21 | \n", "Premium | \n", "E | \n", "SI1 | \n", "59.8 | \n", "61.0 | \n", "3.89 | \n", "3.84 | \n", "2.31 | \n", "
3 | \n", "0.23 | \n", "Good | \n", "E | \n", "VS1 | \n", "56.9 | \n", "65.0 | \n", "4.05 | \n", "4.07 | \n", "2.31 | \n", "
4 | \n", "0.29 | \n", "Premium | \n", "I | \n", "VS2 | \n", "62.4 | \n", "58.0 | \n", "4.20 | \n", "4.23 | \n", "2.63 | \n", "
5 | \n", "0.31 | \n", "Good | \n", "J | \n", "SI2 | \n", "63.3 | \n", "58.0 | \n", "4.34 | \n", "4.35 | \n", "2.75 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
53936 | \n", "0.72 | \n", "Ideal | \n", "D | \n", "SI1 | \n", "60.8 | \n", "57.0 | \n", "5.75 | \n", "5.76 | \n", "3.50 | \n", "
53937 | \n", "0.72 | \n", "Good | \n", "D | \n", "SI1 | \n", "63.1 | \n", "55.0 | \n", "5.69 | \n", "5.75 | \n", "3.61 | \n", "
53938 | \n", "0.70 | \n", "Very Good | \n", "D | \n", "SI1 | \n", "62.8 | \n", "60.0 | \n", "5.66 | \n", "5.68 | \n", "3.56 | \n", "
53939 | \n", "0.86 | \n", "Premium | \n", "H | \n", "SI2 | \n", "61.0 | \n", "58.0 | \n", "6.15 | \n", "6.12 | \n", "3.74 | \n", "
53940 | \n", "0.75 | \n", "Ideal | \n", "D | \n", "SI2 | \n", "62.2 | \n", "55.0 | \n", "5.83 | \n", "5.87 | \n", "3.64 | \n", "
53940 rows × 9 columns
\n", "\n", " | carat | \n", "depth | \n", "table | \n", "x | \n", "y | \n", "z | \n", "
---|---|---|---|---|---|---|
count | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "53940.000000 | \n", "
mean | \n", "0.797940 | \n", "61.749405 | \n", "57.457184 | \n", "5.731157 | \n", "5.734526 | \n", "3.538734 | \n", "
std | \n", "0.474011 | \n", "1.432621 | \n", "2.234491 | \n", "1.121761 | \n", "1.142135 | \n", "0.705699 | \n", "
min | \n", "0.200000 | \n", "43.000000 | \n", "43.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
25% | \n", "0.400000 | \n", "61.000000 | \n", "56.000000 | \n", "4.710000 | \n", "4.720000 | \n", "2.910000 | \n", "
50% | \n", "0.700000 | \n", "61.800000 | \n", "57.000000 | \n", "5.700000 | \n", "5.710000 | \n", "3.530000 | \n", "
75% | \n", "1.040000 | \n", "62.500000 | \n", "59.000000 | \n", "6.540000 | \n", "6.540000 | \n", "4.040000 | \n", "
max | \n", "5.010000 | \n", "79.000000 | \n", "95.000000 | \n", "10.740000 | \n", "58.900000 | \n", "31.800000 | \n", "
\n", " | cut | \n", "color | \n", "clarity | \n", "
---|---|---|---|
count | \n", "53940 | \n", "53940 | \n", "53940 | \n", "
unique | \n", "5 | \n", "7 | \n", "8 | \n", "
top | \n", "Ideal | \n", "G | \n", "SI1 | \n", "
freq | \n", "21551 | \n", "11292 | \n", "13065 | \n", "