{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Data handling\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# Vizualisation (Matplotlib, Plotly, Seaborn, etc. )\n", "import matplotlib.pyplot as plt\n", "# EDA (pandas-profiling, etc. )\n", "...\n", "\n", "# Feature Processing (Scikit-learn processing, etc. )\n", "from sklearn.metrics import mean_squared_error, mean_squared_log_error\n", "\n", "# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.ensemble import ExtraTreesRegressor\n", "from xgboost import XGBRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "\n", "\n", "\n", "# Other packages\n", "from joblib import dump\n", "import os\n", "import pickle\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
store_nbrfamilysalesonpromotiontransactionsholiday_typeoil_pricecityclusterdayyearmonth
date
1970-01-01 00:00:00.00000201325AUTOMOTIVE0.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Personal Care0.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Personal Care2.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325Beverages810.00770Workday93.14Salinas1119701
1970-01-01 00:00:00.00000201325STATIONERY0.00770Workday93.14Salinas1119701
\n", "
" ], "text/plain": [ " store_nbr family sales onpromotion \\\n", "date \n", "1970-01-01 00:00:00.000002013 25 AUTOMOTIVE 0.0 0 \n", "1970-01-01 00:00:00.000002013 25 Personal Care 0.0 0 \n", "1970-01-01 00:00:00.000002013 25 Personal Care 2.0 0 \n", "1970-01-01 00:00:00.000002013 25 Beverages 810.0 0 \n", "1970-01-01 00:00:00.000002013 25 STATIONERY 0.0 0 \n", "\n", " transactions holiday_type oil_price city \\\n", "date \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "1970-01-01 00:00:00.000002013 770 Workday 93.14 Salinas \n", "\n", " cluster day year month \n", "date \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 \n", "1970-01-01 00:00:00.000002013 1 1 1970 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('R2data.csv')\n", "data.drop(columns=['Unnamed: 0'], inplace=True)\n", "\n", "# Convert the date column to a datetime object\n", "data['date'] = pd.to_datetime(data['date'])\n", "\n", "# Set the date column as the index\n", "data = data.set_index('date')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "y = data['sales'] # Target Variable\n", "X = data.drop('sales', axis = 1) # Independent Variable" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "numeric_transformer = Pipeline(steps = [('num_imputer',SimpleImputer(strategy = 'mean')),('scaler',StandardScaler())])\n", "categorical_transformer = Pipeline(steps = [('cat_imputer',SimpleImputer(strategy ='most_frequent')),('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "categorical_feature =[\"family\", \"city\", \"holiday_type\"]\n", "numeric_feature = ['store_nbr', 'onpromotion', 'transactions', 'oil_price', 'cluster','year', 'month']\n", "preprocessor = ColumnTransformer(transformers=[('numeric_transformer',numeric_transformer,numeric_feature),('categorical_transformer',categorical_transformer,categorical_feature)],remainder='drop')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Split the data into training and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. 
`sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] } ], "source": [ "rf = GradientBoostingRegressor(n_estimators=100, random_state=42)\n", "\n", "rf = Pipeline(steps=[('preprocessor',preprocessor),('estimator',rf)])\n", "rf.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "rf_predictions = rf.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)\n", "\n", "\n", "results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Gradient Boosting Regression Model\n", "#rf = GradientBoostingRegressor(n_estimators=100, random_state=42)\n", "#rf.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "#rf_predictions = rf.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)\n", "\n", "\n", "#results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
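, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The fit -> predict -> RMSLE pattern above repeats for every model below.\n", "# A minimal helper sketch that could replace that duplication (assumes the\n", "# `preprocessor`, train/test splits and `results` frame defined above;\n", "# `evaluate_model` is a hypothetical name, not part of any library):\n", "def evaluate_model(name, estimator, results_df):\n", "    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('estimator', estimator)])\n", "    pipe.fit(X_train, y_train)\n", "    preds = pipe.predict(X_test)\n", "    rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(preds))).round(2)\n", "    row = pd.DataFrame([[name, rmsle]], columns=['Model', 'RMSLE'])\n", "    return pipe, pd.concat([results_df, row], ignore_index=True)" ] }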
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSLEModelRMSLE
0Gradient Boosting2.48Extra Tree1.93
\n", "
" ], "text/plain": [ " Model RMSLE Model RMSLE\n", "0 Gradient Boosting 2.48 Extra Tree 1.93" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Extra Trees Regression Model\n", "sg = ExtraTreesRegressor(n_estimators=100, random_state=42)\n", "sg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',sg)])\n", "sg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "sg_predictions = sg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)\n", "\n", "\n", "model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])\n", "results = pd.concat([results, model_results], axis=1)\n", "results" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSLEModelRMSLEModelRMSLEModelRMSLE
0Gradient Boosting2.48Extra Tree1.93Extra Tree1.93XGBoost2.15
\n", "
" ], "text/plain": [ " Model RMSLE Model RMSLE Model RMSLE Model \\\n", "0 Gradient Boosting 2.48 Extra Tree 1.93 Extra Tree 1.93 XGBoost \n", "\n", " RMSLE \n", "0 2.15 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Extra Trees Regression Model\n", "xg = XGBRegressor(n_estimators=100, random_state=42)\n", "xg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',xg)])\n", "xg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "xg_predictions = xg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(xg_predictions))).round(2)\n", "\n", "\n", "model_result = pd.DataFrame([['XGBoost', rmsle]], columns = ['Model', 'RMSLE'])\n", "results = pd.concat([results, model_result], axis=1)\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Extra Trees Regression Model\n", "#sg = ExtraTreesRegressor(n_estimators=100, random_state=42)\n", "#sg.fit(X_train, y_train)\n", "\n", "# Make prediction on X_test\n", "#sg_predictions = sg.predict(X_test)\n", "\n", "\n", "# Evaluate our models\n", "#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)\n", "\n", "\n", "#model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])\n", "#results = pd.concat([results, model_results], axis=1)\n", "#results" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "best_model = xg\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# set the destination path to the \"export\" directory\n", "#destination = \".\"\n", "\n", "# create a dictionary to store the objects and their filenames\n", "#models = {\"numerical_imputer\": numerical_imputer,\n", "# \"categorical_imputer\": categorical_imputer,\n", "# \"scaler\": scaler,\n", "# \"le_family\": le_family,\n", "# \"le_holiday_type\": le_holiday_type,\n", "# \"le_city\": le_city,\n", "# \"Final_model\": best_model}\n", "\n", "# loop through the models and save them using joblib.dump()\n", "#for name, model in models.items():\n", "# dump(model, os.path.join(destination, f\"{name}.joblib\"), compress=(\"lzma\", 5))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# set the destination path to the \"export\" directory\n", "destination = \".\"\n", "\n", "# create a dictionary to store the objects and their filenames\n", "models = {\"Best_model\": best_model}\n", "\n", "# loop through the models and save them using joblib.dump()\n", "for name, model in models.items():\n", " dump(model, os.path.join(destination, f\"{name}.joblib\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Identify numeric and non-numeric columns\n", "#num_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n", "#cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()\n", "\n", "# Creating imputer variables\n", "#numerical_imputer = SimpleImputer(strategy = \"mean\")\n", "#categorical_imputer = SimpleImputer(strategy = \"most_frequent\")\n", "\n", "#X_cat = X[cat_cols].copy()\n", "#X_num = X[num_cols].copy()\n", "\n", "\n", "# Fitting the Imputer\n", "#X_cat_imputed = 
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }