{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Data handling\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# Vizualisation (Matplotlib, Plotly, Seaborn, etc. )\n", "import matplotlib.pyplot as plt\n", "# EDA (pandas-profiling, etc. )\n", "...\n", "\n", "# Feature Processing (Scikit-learn processing, etc. )\n", "from sklearn.metrics import mean_squared_error, mean_squared_log_error\n", "\n", "# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.ensemble import ExtraTreesRegressor\n", "from xgboost import XGBRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "\n", "\n", "\n", "# Other packages\n", "from joblib import dump\n", "import os\n", "import pickle\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
store_nbrfamilysalesonpromotiontransactionsholiday_typeoil_pricecityclusterdayyearmonth
date
1970-01-01 00:00:00.00000201325AUTOMOTIVE0.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Personal Care0.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Personal Care2.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Beverages810.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325STATIONERY0.00770Workday93.14Salinas1119701
\n", "
" ], "text/plain": [ " store_nbr family sales onpromotion \\\n", "date \n", "1970-01-01 00:00:00.000002013 25 AUTOMOTIVE 0.0 0 \n", "1970-01-01 00:00:00.000002013 25 Personal Care 0.0 0 \n", "1970-01-01 00:00:00.000002013 25 Personal Care 2.0 0 \n", "1970-01-01 00:00:00.000002013 25 Beverages 810.0 0 \n", "1970-01-01 00:00:00.000002013 25 STATIONERY 0.0 0 \n", "\n", " transactions holiday_type oil_price city \\\n", "date \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "\n", " cluster day year month \n", "date \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('R2data.csv')\n", "data.drop(columns=['Unnamed: 0'], inplace=True)\n", "\n", "# Convert the date column to a datetime object\n", "data['date'] = pd.to_datetime(data['date'])\n", "\n", "# Set the date column as the index\n", "data = data.set_index('date')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "y = data['sales'] # Target Variable\n", "X = data.drop('sales', axis = 1) # Independent Variable" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "numeric_transformer = Pipeline(steps = [('num_imputer',SimpleImputer(strategy = 'mean')),('scaler',StandardScaler())])\n", "categorical_transformer = Pipeline(steps = [('cat_imputer',SimpleImputer(strategy ='most_frequent')),('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "categorical_feature =[\"family\", \"city\", \"holiday_type\"]\n", "numeric_feature = ['store_nbr', 'onpromotion', 'transactions', 'oil_price', 'cluster','year', 'month']\n", "preprocessor = ColumnTransformer(transformers=[('numeric_transformer',numeric_transformer,numeric_feature),('categorical_transformer',categorical_transformer,categorical_feature)],remainder='drop')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Split the data into training and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. 
`sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] } ], "source": [ "rf = GradientBoostingRegressor(n_estimators=100, random_state=42)\n", "\n", "rf = Pipeline(steps=[('preprocessor',preprocessor),('estimator',rf)])\n", "rf.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "rf_predictions = rf.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)\n", "\n", "\n", "results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Gradient Boosting Regression Model\n", "#rf = GradientBoostingRegressor(n_estimators=100, random_state=42)\n", "#rf.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "#rf_predictions = rf.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)\n", "\n", "\n", "#results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
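, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The fit -> predict -> RMSLE pattern above repeats for every model below.\n", "# A minimal helper sketch that could replace that duplication (assumes the\n", "# `preprocessor`, train/test splits and `results` frame defined above;\n", "# `evaluate_model` is a hypothetical name, not part of any library):\n", "def evaluate_model(name, estimator, results_df):\n", "    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('estimator', estimator)])\n", "    pipe.fit(X_train, y_train)\n", "    preds = pipe.predict(X_test)\n", "    rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(preds))).round(2)\n", "    row = pd.DataFrame([[name, rmsle]], columns=['Model', 'RMSLE'])\n", "    return pipe, pd.concat([results_df, row], ignore_index=True)" ] }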
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSLEModelRMSLE
0Gradient Boosting2.48Extra Tree1.93
\n", "
" ], "text/plain": [ " Model RMSLE Model RMSLE\n", "0 Gradient Boosting 2.48 Extra Tree 1.93" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Extra Trees Regression Model\n", "sg = ExtraTreesRegressor(n_estimators=100, random_state=42)\n", "sg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',sg)])\n", "sg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "sg_predictions = sg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)\n", "\n", "\n", "model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])\n", "results = pd.concat([results, model_results], axis=1)\n", "results" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSLEModelRMSLEModelRMSLEModelRMSLE
0Gradient Boosting2.48Extra Tree1.93Extra Tree1.93XGBoost2.15
\n", "
" ], "text/plain": [ " Model RMSLE Model RMSLE Model RMSLE Model \\\n", "0 Gradient Boosting 2.48 Extra Tree 1.93 Extra Tree 1.93 XGBoost \n", "\n", " RMSLE \n", "0 2.15 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Extra Trees Regression Model\n", "xg = XGBRegressor(n_estimators=100, random_state=42)\n", "xg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',xg)])\n", "xg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "xg_predictions = xg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(xg_predictions))).round(2)\n", "\n", "\n", "model_result = pd.DataFrame([['XGBoost', rmsle]], columns = ['Model', 'RMSLE'])\n", "results = pd.concat([results, model_result], axis=1)\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Extra Trees Regression Model\n", "#sg = ExtraTreesRegressor(n_estimators=100, random_state=42)\n", "#sg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "#sg_predictions = sg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)\n", "\n", "\n", "#model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])\n", "#results = pd.concat([results, model_results], axis=1)\n", "#results" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "best_model = xg\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# set the destination path to the \"export\" directory\n", "#destination = \".\"\n", "\n", "# create a dictionary to store the objects and their filenames\n", "#models = {\"numerical_imputer\": numerical_imputer,\n", "# \"categorical_imputer\": categorical_imputer,\n", "# \"scaler\": scaler,\n", "# \"le_family\": le_family,\n", "# \"le_holiday_type\": le_holiday_type,\n", "# \"le_city\": le_city,\n", "# \"Final_model\": best_model}\n", "\n", "# loop through the models and save them using joblib.dump()\n", "#for name, model in models.items():\n", "# dump(model, os.path.join(destination, f\"{name}.joblib\"), compress=(\"lzma\", 5))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# set the destination path to the \"export\" directory\n", "destination = \".\"\n", "\n", "# create a dictionary to store the objects and their filenames\n", "models = {\"Best_model\": best_model}\n", "\n", "# loop through the models and save them using joblib.dump()\n", "for name, model in models.items():\n", " dump(model, os.path.join(destination, f\"{name}.joblib\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Identify numeric and non-numeric columns\n", "#num_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n", "#cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()\n", "\n", "# Creating imputer variables\n", "#numerical_imputer = SimpleImputer(strategy = \"mean\")\n", "#categorical_imputer = SimpleImputer(strategy = \"most_frequent\")\n", "\n", "#X_cat = X[cat_cols].copy()\n", "#X_num = X[num_cols].copy()\n", "\n", "\n", "# Fitting the Imputer\n", "#X_cat_imputed = 
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }