In [2]:
# Import packages
import plotly.graph_objects as go
import pandas as pd
from sklearn.model_selection import KFold


def plot_cross_val(
    n_splits: int, splitter_func, df: pd.DataFrame, title_text: str
) -> None:
    """Plot the train/test row positions of every cross-validation split.

    One horizontal band is drawn per split: the positions assigned to the
    training set in blue and those assigned to the test set in goldenrod.

    Args:
        n_splits: Number of splits, forwarded as
            ``splitter_func(n_splits=n_splits)``.
        splitter_func: Splitter class or factory (e.g. ``KFold``) whose
            instances provide ``split(df)`` yielding
            ``(train_index, valid_index)`` pairs.
        df: Data to split; it is only passed on to ``split``.
        title_text: Title shown above the figure.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    dataset_colours = (("Train", "blue"), ("Test", "goldenrod"))

    # One row per (split, dataset) holding that dataset's index collection.
    plot_data = []
    splits = splitter_func(n_splits=n_splits).split(df)
    for split, (train_index, valid_index) in enumerate(splits, start=1):
        plot_data.append([train_index, "Train", f"{split}"])
        plot_data.append([valid_index, "Test", f"{split}"])

    # Explode to long form: one row per individual index value.
    plot_df = pd.DataFrame(
        plot_data, columns=["Index", "Dataset", "Split"]
    ).explode("Index")

    fig = go.Figure()

    # The split labels are strings, so a sorted groupby would order them
    # lexicographically ("10" before "2") and add the traces out of order once
    # there are ten or more splits. sort=False keeps the order of creation.
    groups = plot_df.groupby("Split", sort=False)
    for position, (_, group) in enumerate(groups):
        for dataset, colour in dataset_colours:
            rows = group.loc[group["Dataset"] == dataset]
            fig.add_trace(
                go.Scatter(
                    x=rows["Index"],
                    y=rows["Split"],
                    name=dataset,
                    line=dict(color=colour, width=10),
                    # Every split reuses the same two trace names; only the
                    # first split contributes legend entries.
                    showlegend=position == 0,
                )
            )

    fig.update_layout(
        template="simple_white",
        font=dict(size=20),
        title_text=title_text,
        title_x=0.5,
        width=850,
        height=450,
        xaxis_title="Index",
        yaxis_title="Split",
    )

    fig.show()


if __name__ == "__main__":
    # Load the CSV and convert the Month column to datetimes
    data = pd.read_csv("../coal-price-data/AirPassengers.csv").assign(
        Month=lambda frame: pd.to_datetime(frame["Month"])
    )

    # Show the train/test partitions produced by KFold with 5 splits
    plot_cross_val(
        df=data, splitter_func=KFold, title_text="Cross-Validation", n_splits=5
    )

In [3]:
# Import packages
from sklearn.model_selection import TimeSeriesSplit

# Show the train/test partitions produced by TimeSeriesSplit with 5 splits
plot_cross_val(
    df=data,
    splitter_func=TimeSeriesSplit,
    title_text="Time Series Cross-Validation",
    n_splits=5,
)

In [4]:
# Import packages
import plotly.express as px
import pandas as pd


def plot_time_series(df: pd.DataFrame) -> None:
    """Draw the passenger counts in ``df`` as a line chart and display it.

    Args:
        df: Frame with a ``Month`` column (x axis, labelled "Date") and a
            ``#Passengers`` column (y axis, labelled "Passengers").

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    axis_labels = {"Month": "Date", "#Passengers": "Passengers"}
    layout_options = {
        "template": "simple_white",
        "font": {"size": 18},
        "title_text": "Airline Passengers",
        "width": 650,
        "title_x": 0.5,
        "height": 400,
    }

    chart = px.line(df, x="Month", y="#Passengers", labels=axis_labels)
    chart.update_layout(**layout_options)

    return chart.show()


if __name__ == "__main__":
    # Load the CSV and convert the Month column to datetimes
    data = pd.read_csv("../coal-price-data/AirPassengers.csv").assign(
        Month=lambda frame: pd.to_datetime(frame["Month"])
    )

    # Draw the passenger series
    plot_time_series(data)

In [7]:
# Import packages
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing


def hyperparameter_tuning_season_cv(
    n_splits: int,
    gammas: list[float],
    df: pd.DataFrame,
    target_col: str = "#Passengers",
    seasonal_periods: int = 12,
) -> pd.DataFrame:
    """Cross-validate the seasonal smoothing parameter of a Holt-Winters model.

    For every candidate gamma, an ``ExponentialSmoothing`` model with
    multiplicative trend and seasonality is fitted on the training part of
    each ``TimeSeriesSplit`` fold with ``smoothing_seasonal=gamma``. It then
    forecasts ``len(valid)`` steps and is scored against the validation part
    with ``mean_absolute_percentage_error``. The fold scores are averaged
    per gamma.

    Note: ``TimeSeriesSplit`` is not imported in this cell; it must already
    be in scope from an earlier cell.

    Args:
        n_splits: Number of folds passed to ``TimeSeriesSplit``.
        gammas: Candidate values for ``smoothing_seasonal``.
        df: Data containing the series to model.
        target_col: Column of ``df`` holding the series. Defaults to
            ``"#Passengers"``, the column previously hard-coded here.
        seasonal_periods: Season length passed to ``ExponentialSmoothing``.
            Defaults to 12, the value previously hard-coded here.

    Returns:
        DataFrame with one row per gamma and columns ``Gamma`` and ``MAPE``
        (the mean of the per-fold errors).
    """
    # The fold indices and the target series do not depend on gamma, so they
    # are computed once rather than once per candidate.
    folds = list(TimeSeriesSplit(n_splits=n_splits).split(df))
    series = df[target_col]

    error_list = []
    for gamma in gammas:
        errors = []

        for train_index, valid_index in folds:
            train, valid = series.iloc[train_index], series.iloc[valid_index]

            model = ExponentialSmoothing(
                train,
                trend="mul",
                seasonal="mul",
                seasonal_periods=seasonal_periods,
            ).fit(smoothing_seasonal=gamma)

            forecasts = model.forecast(len(valid))
            errors.append(mean_absolute_percentage_error(valid, forecasts))

        error_list.append([gamma, sum(errors) / len(errors)])

    return pd.DataFrame(error_list, columns=["Gamma", "MAPE"])


def plot_error_cv(df: pd.DataFrame, title: str) -> None:
    """Show a bar chart of the error obtained for each hyperparameter value.

    Note: ``px`` is not imported in this cell; it must already be in scope
    from an earlier cell.

    Args:
        df: Frame with a ``Gamma`` column (x axis) and a ``MAPE`` column
            (bar height).
        title: Title shown above the chart.

    Returns:
        None. The figure is displayed with ``show()``.
    """
    layout_options = {
        "template": "simple_white",
        "font": {"size": 18},
        "title_text": title,
        "width": 800,
        "title_x": 0.5,
        "height": 400,
    }

    # update_layout hands back the figure, so the calls can be chained.
    chart = px.bar(df, x="Gamma", y="MAPE").update_layout(**layout_options)

    return chart.show()

In [9]:
# Load the CSV and convert the Month column to datetimes
data = pd.read_csv("../coal-price-data/AirPassengers.csv").assign(
    Month=lambda frame: pd.to_datetime(frame["Month"])
)

In [10]:
# Carry out cv for hyperparameter tuning for the seasonal parameter.
# Candidate gammas are 0.0, 0.1, ..., 1.0. linspace fixes the number of points
# (arange with a float step does not guarantee it), and rounding strips float
# noise such as 0.30000000000000004 from the values stored in the Gamma column.
error_df = hyperparameter_tuning_season_cv(
    df=data, n_splits=4, gammas=np.linspace(0, 1, 11).round(1).tolist()
)


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul


overflow encountered in matmul



In [11]:
# Display the tuning results: one row per gamma with its mean MAPE across folds
error_df

Unnamed: 0,Gamma,MAPE
0,0.0,0.048897
1,0.1,0.051128
2,0.2,0.052672
3,0.3,0.053538
4,0.4,0.052919
5,0.5,0.047107
6,0.6,0.044867
7,0.7,0.044603
8,0.8,0.046695
9,0.9,0.048922


In [12]:
# Bar chart of the MAPE obtained for each gamma
plot_error_cv(title="Hyperparameter Results", df=error_df)