In [1]:
# Data handling
import pandas as pd
import numpy as np

# Vizualisation (Matplotlib, Plotly, Seaborn, etc. )
import matplotlib.pyplot as plt
# EDA (pandas-profiling, etc. )
...

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



# Other packages
from joblib import dump
import os
import pickle


In [2]:
data = pd.read_csv('R2data.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)

# Convert the date column to a datetime object
data['date'] = pd.to_datetime(data['date'])

# Set the date column as the index
data = data.set_index('date')
data.head()

Unnamed: 0_level_0,store_nbr,family,sales,onpromotion,transactions,holiday_type,oil_price,city,cluster,day,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:00:00.000002013,25,AUTOMOTIVE,0.0,0,770,Workday,93.14,Salinas,1,1,1970,1
1970-01-01 00:00:00.000002013,25,Personal Care,0.0,0,770,Workday,93.14,Salinas,1,1,1970,1
1970-01-01 00:00:00.000002013,25,Personal Care,2.0,0,770,Workday,93.14,Salinas,1,1,1970,1
1970-01-01 00:00:00.000002013,25,Beverages,810.0,0,770,Workday,93.14,Salinas,1,1,1970,1
1970-01-01 00:00:00.000002013,25,STATIONERY,0.0,0,770,Workday,93.14,Salinas,1,1,1970,1


In [3]:
y = data['sales']                         # Target Variable
X = data.drop('sales', axis = 1)          # Independent Variable

In [4]:
numeric_transformer = Pipeline(steps = [('num_imputer',SimpleImputer(strategy = 'mean')),('scaler',StandardScaler())])
categorical_transformer = Pipeline(steps = [('cat_imputer',SimpleImputer(strategy ='most_frequent')),('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [5]:
categorical_feature =["family", "city", "holiday_type"]
numeric_feature = ['store_nbr', 'onpromotion', 'transactions', 'oil_price', 'cluster','year', 'month']
preprocessor = ColumnTransformer(transformers=[('numeric_transformer',numeric_transformer,numeric_feature),('categorical_transformer',categorical_transformer,categorical_feature)],remainder='drop')

In [6]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
rf = GradientBoostingRegressor(n_estimators=100, random_state=42)

rf = Pipeline(steps=[('preprocessor',preprocessor),('estimator',rf)])
rf.fit(X_train, y_train)

# Make prediction on X_test
rf_predictions = rf.predict(X_test)


# Evaluate our models
rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)


results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])



In [8]:
# Gradient Boosting Regression Model
#rf = GradientBoostingRegressor(n_estimators=100, random_state=42)
#rf.fit(X_train, y_train)

# Make prediction on X_test
#rf_predictions = rf.predict(X_test)


# Evaluate our models
#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(rf_predictions))).round(2)


#results = pd.DataFrame([['Gradient Boosting', rmsle]], columns = ['Model', 'RMSLE'])

In [9]:
# Extra Trees Regression Model
sg = ExtraTreesRegressor(n_estimators=100, random_state=42)
sg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',sg)])
sg.fit(X_train, y_train)

# Make prediction on X_test
sg_predictions = sg.predict(X_test)


# Evaluate our models
rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)


model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])
results = pd.concat([results, model_results], axis=1)
results



Unnamed: 0,Model,RMSLE,Model.1,RMSLE.1
0,Gradient Boosting,2.48,Extra Tree,1.93


In [12]:
# Extra Trees Regression Model
xg = XGBRegressor(n_estimators=100, random_state=42)
xg = Pipeline(steps=[('preprocessor',preprocessor),('estimator',xg)])
xg.fit(X_train, y_train)

# Make prediction on X_test
xg_predictions = xg.predict(X_test)


# Evaluate our models
rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(xg_predictions))).round(2)


model_result = pd.DataFrame([['XGBoost', rmsle]], columns = ['Model', 'RMSLE'])
results = pd.concat([results, model_result], axis=1)
results



Unnamed: 0,Model,RMSLE,Model.1,RMSLE.1,Model.2,RMSLE.2,Model.3,RMSLE.3
0,Gradient Boosting,2.48,Extra Tree,1.93,Extra Tree,1.93,XGBoost,2.15


In [11]:
# Extra Trees Regression Model
#sg = ExtraTreesRegressor(n_estimators=100, random_state=42)
#sg.fit(X_train, y_train)

# Make prediction on X_test
#sg_predictions = sg.predict(X_test)


# Evaluate our models
#rmsle = np.sqrt(mean_squared_log_error(abs(y_test), abs(sg_predictions))).round(2)


#model_results = pd.DataFrame([['Extra Tree', rmsle]], columns = ['Model', 'RMSLE'])
#results = pd.concat([results, model_results], axis=1)
#results

In [13]:
best_model = xg


In [13]:
# set the destination path to the "export" directory
#destination = "."

# create a dictionary to store the objects and their filenames
#models = {"numerical_imputer": numerical_imputer,
#          "categorical_imputer": categorical_imputer,
#          "scaler": scaler,
#          "le_family": le_family,
#          "le_holiday_type": le_holiday_type,
#          "le_city": le_city,
#          "Final_model": best_model}

# loop through the models and save them using joblib.dump()
#for name, model in models.items():
#    dump(model, os.path.join(destination, f"{name}.joblib"), compress=("lzma", 5))

In [14]:
# set the destination path to the "export" directory
destination = "."

# create a dictionary to store the objects and their filenames
models = {"Best_model": best_model}

# loop through the models and save them using joblib.dump()
for name, model in models.items():
    dump(model, os.path.join(destination, f"{name}.joblib"))

In [15]:
# Identify numeric and non-numeric columns
#num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
#cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Creating imputer variables
#numerical_imputer = SimpleImputer(strategy = "mean")
#categorical_imputer = SimpleImputer(strategy = "most_frequent")

#X_cat = X[cat_cols].copy()
#X_num = X[num_cols].copy()


# Fitting the Imputer
#X_cat_imputed = categorical_imputer.fit_transform(X_cat)
#X_num_imputed = numerical_imputer.fit_transform(X_num)

# Convert NumPy arrays to DataFrames
#X_cat_imputed = pd.DataFrame(X_cat_imputed, columns=cat_cols)
#X_num_imputed = pd.DataFrame(X_num_imputed, columns=num_cols)


#scaler = StandardScaler()

#X_num_scaled = scaler.fit_transform(X_num_imputed)
#X_num_sc = pd.DataFrame(X_num_scaled, columns = num_cols)



# Concatenate the imputed dataframes
#X = pd.concat([X_num_sc, X_cat_imputed], axis=1)

#le_family = LabelEncoder()
#X['family'] = le_family.fit_transform(X['family'])

#le_holiday_type = LabelEncoder()
#X['holiday_type'] = le_holiday_type.fit_transform(X['holiday_type'])

#le_city = LabelEncoder()
#X['city'] = le_city.fit_transform(X['city'])

#X.info()